In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
from scipy import stats

In [None]:
# Load the training data; the ID column is dropped before any analysis.
df = pd.read_csv('/kaggle/input/customer-analytics/Train.csv')
df = df.drop(['ID'],axis=1)

# Positional rename to short snake_case names.
# NOTE(review): assumes Train.csv keeps its original column order - confirm against the dataset.
df.columns = ['ware_block','mode_ship','cust_call','cust_rating','product_cost','prior_purchase','product_impt','gender','discount','weight','not_ontime_delivery']

# Feature groups reused by the later cells.
df_num = ['cust_call','cust_rating','product_cost','prior_purchase','discount','weight']
df_cat = ['ware_block','mode_ship','gender']

# Correlate numeric columns only. The frame still carries non-numeric columns
# (e.g. the df_cat ones); older pandas dropped them silently inside corr(),
# but pandas >= 2.0 raises on them, so the selection is made explicit here.
heatmap_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize = (10,8))
sns.heatmap(heatmap_corr,annot=True)
plt.show()

**Korelasi feature dengan variabel target**
- Discount memiliki korelasi (positif) paling tinggi dengan not_ontime_delivery, dengan koefisien korelasi 0.4
- Berat barang (weight) memiliki korelasi tertinggi kedua (negatif), dengan koefisien korelasi -0.27
- Keempat variabel lain, yaitu product cost (-0.074), customer care call (-0.067), prior purchase (-0.056), dan customer rating (0.013), memiliki korelasi yang cenderung rendah, dengan nilai absolut di bawah 0.1

## Feature Engineering
By **Syahrul Ilyasa**

### Data Outliers

In [None]:
# Box plot of every numeric feature, outliers still included

plt.figure(figsize=(12, 4))
for position, column in enumerate(df_num, start=1):
    plt.subplot(1, 6, position)
    sns.boxplot(y=df[column], orient='v')
    plt.tight_layout()

In [None]:
# Handling outliers
#
# dfx  : copy of df that keeps every row.
# dfxx : copy of df from which rows outside the 1.5 * IQR fences are removed.

dfx = df.copy()
dfxx = df.copy()

print(f'Jumlah baris sebelum memfilter outlier: {len(dfxx)}')

# Start with every row kept, then AND-in the fence test of each column.
fil_ent = np.array([True] * len(dfxx))
for col in ['product_cost', 'discount','weight']:
    Q1 = dfxx[col].quantile(0.25)
    Q3 = dfxx[col].quantile(0.75)
    IQR = Q3 - Q1
    low_limit = Q1 - (IQR * 1.5)
    high_limit = Q3 + (IQR * 1.5)

    fil_ent = ((dfxx[col] >= low_limit) & (dfxx[col] <= high_limit)) & fil_ent

# drop=True discards the old row labels; a bare reset_index() would insert
# them into the frame as an extra 'index' column.
dfxx = dfxx[fil_ent].reset_index(drop=True)

print('Jumlah baris setelah memfilter outlier', len(dfxx))

In [None]:
# Data after the outliers have been removed.
# dfxx is the IQR-filtered frame; dfx is an unfiltered copy of df, so plotting
# dfx here would just repeat the "with outliers" figure.
plt.figure(figsize = (12,4))
for i in range(0, len(df_num)):
    plt.subplot(1, 6, i+1)
    sns.boxplot(y = dfxx[df_num[i]], orient='v')
    plt.tight_layout()

### Normalisasi Data

In [None]:
# Inspect each numeric feature's distribution before normalisation
plt.figure(figsize=(12, 5))
for slot, feature in enumerate(df_num, start=1):
    plt.subplot(2, 3, slot)
    sns.histplot(dfx[feature], kde=True)
    plt.tight_layout()

In [None]:
# Normalise the data
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Each column gets its own freshly fitted MinMaxScaler (default settings);
# the scaled values are stored next to the source as '<column>_norm'.
for source_col in ('product_cost', 'discount', 'weight'):
    column_values = dfx[source_col].values.reshape(len(dfx), 1)
    dfx[source_col + '_norm'] = MinMaxScaler().fit_transform(column_values)

In [None]:
# Inspect the distributions after normalisation
df_norm = ['product_cost_norm','discount_norm','weight_norm']

plt.figure(figsize=(12, 5))
for slot, feature in enumerate(df_norm, start=1):
    plt.subplot(2, 4, slot)
    sns.histplot(dfx[feature], kde=True)
    plt.tight_layout()

In [None]:
# Drop the raw columns that now have a normalised counterpart
dfx.drop(columns=['product_cost', 'discount', 'weight'], inplace=True)

dfx.sample(5)

In [None]:
# Label-encode product importance
def product_impt(x):
    """Map a row's ``product_impt`` text to an ordinal code.

    Returns 1 when the text contains 'low', 2 when it contains 'medium'
    (and not 'low'), and 3 for anything else.
    """
    value = x['product_impt']
    for keyword, code in (('low', 1), ('medium', 2)):
        if keyword in value:
            return code
    return 3

# Encode row by row from df; the assignment aligns the result with dfx on the row index.
dfx['product_impt'] = df.apply(product_impt, axis=1)
dfx

## Feature Encoding

In [None]:
# One-hot encode every categorical feature and append the indicator columns
for feature in df_cat:
    dummies = pd.get_dummies(dfx[feature], prefix=feature)
    dfx = dfx.join(dummies)
dfx.info()

# Parameter n-1 feature

In [None]:
# Drop the source categorical columns that have been one-hot encoded,
# together with the gender_F indicator
dfx.drop(columns=['ware_block', 'mode_ship', 'gender', 'gender_F'], inplace=True)

In [None]:
# Spot-check five random rows of dfx (no random_state is passed, so the rows differ per run)
dfx.sample(5)

## Bagian 1 - Decision Tree (Model + Tuning)

In [None]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc

def eval_classification(model, pred, xtrain, ytrain, xtest, ytest):
    """Print test-set accuracy, precision, recall, F1 and ROC AUC.

    Parameters
    ----------
    model : fitted classifier
        When it exposes ``predict_proba``, the probability of the positive
        class is used as the ROC score.
    pred : array-like
        Hard class predictions for ``xtest``.
    xtrain, ytrain
        Not used; kept so existing calls keep working.
    xtest, ytest
        Test features and true labels.
    """
    print("Accuracy (Test Set): %.2f" % accuracy_score(ytest, pred))
    print("Precision (Test Set): %.2f" % precision_score(ytest, pred))
    print("Recall (Test Set): %.2f" % recall_score(ytest, pred))
    print("F1-Score (Test Set): %.2f" % f1_score(ytest, pred))

    # A ROC curve needs a continuous score. Feeding it hard 0/1 predictions
    # collapses the curve to a single operating point and misreports the AUC.
    if hasattr(model, "predict_proba"):
        # Column 1 is the positive class - assumes labels are 0/1 (pos_label=1 below).
        scores = model.predict_proba(xtest)[:, 1]
    else:
        scores = pred

    fpr, tpr, thresholds = roc_curve(ytest, scores, pos_label=1) # pos_label: the label treated as positive
    print("AUC: %.2f" % auc(fpr, tpr))

In [None]:
y = dfx['not_ontime_delivery']  # target / label
X = dfx.drop(columns=['not_ontime_delivery'])  # every remaining column is a feature

# Split the data into train and test sets (25% held out)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Baseline decision tree with default hyperparameters
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import numpy as np

# Candidate values for the randomised search
max_depth = list(range(1, 21)) # Maximum number of levels in tree
criterion = ['gini','entropy']
splitter = ['best','random']
# An integer min_samples_split must be >= 2; a candidate of 1 is rejected by
# scikit-learn and only wastes search iterations (the failure is hidden
# because warnings are silenced in this notebook).
min_samples_split = list(range(2, 111)) # Minimum number of samples required to split a node
min_samples_leaf = list(range(1, 1101)) # Minimum number of samples required at each leaf node
# 'auto' was an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3, so it is no longer offered as a candidate.
max_features = ['sqrt','log2'] # Number of features to consider at every split

hyperparameters = dict(max_depth=max_depth,
                       criterion=criterion,
                       splitter=splitter,
                       min_samples_split=min_samples_split, 
                       min_samples_leaf=min_samples_leaf,
                       max_features=max_features,
                      )

# Initialise the model: 5-fold randomised search optimising recall
dt = DecisionTreeClassifier(random_state=42)
model = RandomizedSearchCV(dt, hyperparameters, cv=5, random_state=42, scoring='recall')
model.fit(X_train, y_train)

# Predict & evaluate on the held-out test set
y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

In [None]:
# Report the hyperparameters of the best estimator found by the search
best_params = model.best_estimator_.get_params()
for label, key in (
    ('max_depth', 'max_depth'),
    ('Criterion', 'criterion'),
    ('Splitter', 'splitter'),
    ('min_samples_split', 'min_samples_split'),
    ('min_samples_leaf', 'min_samples_leaf'),
    ('max_features', 'max_features'),
):
    print(f'Best {label}:', best_params[key])

In [None]:
# Score the model on the train and test splits to compare the two
train_score = model.score(X_train, y_train)
print(f'Train score: {train_score!s}')
test_score = model.score(X_test, y_test)
print(f'Test score:{test_score!s}')

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt

# Draw the best tree, limited to max_depth=5 so the figure stays readable
fig, ax = plt.subplots(figsize=(20, 10))
tree.plot_tree(
    model.best_estimator_,
    feature_names=list(X.columns),
    class_names=['0', '1'],
    filled=True,
    max_depth=5,
    fontsize=8,
)

plt.show()

In [None]:
# Horizontal bar chart of the (up to) 25 largest feature importances
feat_importances = pd.Series(model.best_estimator_.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(25)
ax = top_features.plot(kind='barh', figsize=(10, 8))
ax.invert_yaxis()  # largest importance at the top

ax.set_xlabel('score')
ax.set_ylabel('feature')
ax.set_title('feature importance score')

### Bagian 2 - Decision Tree (Feature Selection + Tuning)

In [None]:
y = dfx['not_ontime_delivery']  # target / label
# Features: drop the target plus the three columns excluded in this round
excluded_columns = ['not_ontime_delivery', 'ware_block_C', 'mode_ship_Road', 'gender_M']
X = dfx.drop(columns=excluded_columns)

# Split the data into train and test sets (25% held out)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Baseline decision tree with default hyperparameters
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import numpy as np

# Candidate values for the randomised search
max_depth = list(range(1, 21)) # Maximum number of levels in tree
criterion = ['gini','entropy']
splitter = ['best','random']
# An integer min_samples_split must be >= 2; a candidate of 1 is rejected by
# scikit-learn and only wastes search iterations (the failure is hidden
# because warnings are silenced in this notebook).
min_samples_split = list(range(2, 91)) # Minimum number of samples required to split a node
min_samples_leaf = list(range(1, 1101)) # Minimum number of samples required at each leaf node
# 'auto' was an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3, so it is no longer offered as a candidate.
max_features = ['sqrt', 'log2'] # Number of features to consider at every split

hyperparameters = dict(max_depth=max_depth,
                       criterion=criterion,
                       splitter=splitter,
                       min_samples_split=min_samples_split, 
                       min_samples_leaf=min_samples_leaf,
                       max_features=max_features,
                      )

# Initialise the model: 5-fold randomised search optimising recall
dt = DecisionTreeClassifier(random_state=42)
model = RandomizedSearchCV(dt, hyperparameters, cv=5, random_state=42, scoring='recall')
model.fit(X_train, y_train)

# Predict & evaluate on the held-out test set
y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

In [None]:
# Score the model on the train and test splits to compare the two
train_score = model.score(X_train, y_train)
print(f'Train score: {train_score!s}')
test_score = model.score(X_test, y_test)
print(f'Test score:{test_score!s}')

In [None]:
# Report the hyperparameters of the best estimator found by the search
best_params = model.best_estimator_.get_params()
for label, key in (
    ('max_depth', 'max_depth'),
    ('Criterion', 'criterion'),
    ('Splitter', 'splitter'),
    ('min_samples_split', 'min_samples_split'),
    ('min_samples_leaf', 'min_samples_leaf'),
    ('max_features', 'max_features'),
):
    print(f'Best {label}:', best_params[key])

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt

# Draw the best tree, limited to max_depth=5 so the figure stays readable
fig, ax = plt.subplots(figsize=(30, 10))
tree.plot_tree(
    model.best_estimator_,
    feature_names=list(X.columns),
    class_names=['0', '1'],
    filled=True,
    max_depth=5,
    fontsize=10,
)

plt.show()

In [None]:
# Horizontal bar chart of the (up to) 25 largest feature importances
feat_importances = pd.Series(model.best_estimator_.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(25)
ax = top_features.plot(kind='barh', figsize=(10, 8))
ax.invert_yaxis()  # largest importance at the top

ax.set_xlabel('score')
ax.set_ylabel('feature')
ax.set_title('feature importance score')

### Bagian 3 - Decision Tree (Feature Selection 2 + Tuning)

In [None]:
y = dfx['not_ontime_delivery']  # target / label
# Features: drop the target plus the six columns excluded in this round
excluded_columns = [
    'not_ontime_delivery',
    'ware_block_C',
    'mode_ship_Road',
    'gender_M',
    'ware_block_D',
    'ware_block_B',
    'prior_purchase',
]
X = dfx.drop(columns=excluded_columns)

# Split the data into train and test sets (25% held out)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Baseline decision tree with default hyperparameters
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import numpy as np

# Candidate values for the randomised search
max_depth = list(range(1, 21)) # Maximum number of levels in tree
criterion = ['gini','entropy']
splitter = ['best','random']
# An integer min_samples_split must be >= 2; a candidate of 1 is rejected by
# scikit-learn and only wastes search iterations (the failure is hidden
# because warnings are silenced in this notebook).
min_samples_split = list(range(2, 101)) # Minimum number of samples required to split a node
min_samples_leaf = list(range(1, 1101)) # Minimum number of samples required at each leaf node
# 'auto' was an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3, so it is no longer offered as a candidate.
max_features = ['sqrt', 'log2'] # Number of features to consider at every split

hyperparameters = dict(max_depth=max_depth,
                       criterion=criterion,
                       splitter=splitter,
                       min_samples_split=min_samples_split, 
                       min_samples_leaf=min_samples_leaf,
                       max_features=max_features,
                      )

# Initialise the model: 5-fold randomised search optimising recall
dt = DecisionTreeClassifier(random_state=42)
model = RandomizedSearchCV(dt, hyperparameters, cv=5, random_state=42, scoring='recall')
model.fit(X_train, y_train)

# Predict & evaluate on the held-out test set
y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

In [None]:
# Report the hyperparameters of the best estimator found by the search
best_params = model.best_estimator_.get_params()
for label, key in (
    ('max_depth', 'max_depth'),
    ('Criterion', 'criterion'),
    ('Splitter', 'splitter'),
    ('min_samples_split', 'min_samples_split'),
    ('min_samples_leaf', 'min_samples_leaf'),
    ('max_features', 'max_features'),
):
    print(f'Best {label}:', best_params[key])

In [None]:
# Score the model on the train and test splits to compare the two
train_score = model.score(X_train, y_train)
print(f'Train score: {train_score!s}')
test_score = model.score(X_test, y_test)
print(f'Test score:{test_score!s}')

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt

# Draw the best tree, limited to max_depth=7 so the figure stays readable
fig, ax = plt.subplots(figsize=(30, 10))
tree.plot_tree(
    model.best_estimator_,
    feature_names=list(X.columns),
    class_names=['0', '1'],
    filled=True,
    max_depth=7,
    fontsize=10,
)

plt.show()

In [None]:
# Horizontal bar chart of the (up to) 25 largest feature importances
feat_importances = pd.Series(model.best_estimator_.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(25)
ax = top_features.plot(kind='barh', figsize=(10, 8))
ax.invert_yaxis()  # largest importance at the top

ax.set_xlabel('score')
ax.set_ylabel('feature')
ax.set_title('feature importance score')

### Random Forest - Bagian 1

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc

def eval_classification(model, pred, xtrain, ytrain, xtest, ytest):
    """Print test-set accuracy, precision, recall, F1 and ROC AUC.

    Parameters
    ----------
    model : fitted classifier
        When it exposes ``predict_proba``, the probability of the positive
        class is used as the ROC score.
    pred : array-like
        Hard class predictions for ``xtest``.
    xtrain, ytrain
        Not used; kept so existing calls keep working.
    xtest, ytest
        Test features and true labels.
    """
    print("Accuracy (Test Set): %.2f" % accuracy_score(ytest, pred))
    print("Precision (Test Set): %.2f" % precision_score(ytest, pred))
    print("Recall (Test Set): %.2f" % recall_score(ytest, pred))
    print("F1-Score (Test Set): %.2f" % f1_score(ytest, pred))

    # A ROC curve needs a continuous score. Feeding it hard 0/1 predictions
    # collapses the curve to a single operating point and misreports the AUC.
    if hasattr(model, "predict_proba"):
        # Column 1 is the positive class - assumes labels are 0/1 (pos_label=1 below).
        scores = model.predict_proba(xtest)[:, 1]
    else:
        scores = pred

    fpr, tpr, thresholds = roc_curve(ytest, scores, pos_label=1) # pos_label: the label treated as positive
    print("AUC: %.2f" % auc(fpr, tpr))

def show_feature_importance(model, feature_names=None):
    """Plot the (up to) 25 largest feature importances of ``model`` as horizontal bars.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of labels, optional
        One label per importance value. When omitted, the columns of the
        notebook-level feature matrix ``X`` are used, as before.
    """
    if feature_names is None:
        # Fall back to the global X so existing single-argument calls keep working.
        feature_names = X.columns

    feat_importances = pd.Series(model.feature_importances_, index=feature_names)
    ax = feat_importances.nlargest(25).plot(kind='barh', figsize=(10, 8))
    ax.invert_yaxis()  # largest importance at the top

    plt.xlabel('score')
    plt.ylabel('feature')
    plt.title('feature importance score')

def show_best_hyperparameter(model, hyperparameters):
    """Print ``Best <name>: <value>`` for every key of ``hyperparameters``.

    The value is looked up in ``model.get_params()``; the candidate lists
    stored in ``hyperparameters`` are not used.
    """
    for name in hyperparameters:
        label = 'Best ' + name + ':'
        chosen = model.get_params()[name]
        print(label, chosen)


In [None]:
y = dfx['not_ontime_delivery']  # target / label
X = dfx.drop(columns=['not_ontime_delivery'])  # every remaining column is a feature

# Split the data into train and test sets (25% held out)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Baseline random forest with default hyperparameters
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
eval_classification(rf, y_pred, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Hyperparameter candidates to be tested
hyperparameters = dict(
                       n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)], # Number of trees
                       bootstrap = [True], # Whether to use bootstrapping
                       criterion = ['gini','entropy'],
                       max_depth = [int(x) for x in np.linspace(10, 110, num = 11)],  # Maximum tree depth
                       min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 10, num = 5)], # Minimum samples a node needs before it may be split
                       min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)], # Minimum samples required in a leaf
                       # 'auto' was an alias of 'sqrt' for classifiers and was removed in
                       # scikit-learn 1.3, so it is no longer offered as a candidate.
                       max_features = ['sqrt', 'log2'], # Number of features considered at each split
                       n_jobs = [-1], # Cores for parallel computation; -1 uses all cores
                      )

# Init: 5-fold randomised search optimising recall
rf = RandomForestClassifier(random_state=42)
rf_tuned = RandomizedSearchCV(rf, hyperparameters, cv=5, random_state=42, scoring='recall')
rf_tuned.fit(X_train,y_train)

# Predict & evaluate on the held-out test set
y_pred = rf_tuned.predict(X_test)
eval_classification(rf_tuned, y_pred, X_train, y_train, X_test, y_test)

In [None]:
# Print, for every key in the search space, the value held by the best random-forest estimator
show_best_hyperparameter(rf_tuned.best_estimator_, hyperparameters)

### XGBOOST

In [None]:
y = dfx['not_ontime_delivery']  # target / label
X = dfx.drop(columns=['not_ontime_delivery'])  # every remaining column is a feature

# Split the data into train and test sets (25% held out)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Baseline XGBoost classifier with default hyperparameters
from xgboost import XGBClassifier
xg = XGBClassifier(random_state=42)
xg.fit(X_train, y_train)

y_pred = xg.predict(X_test)
eval_classification(xg, y_pred, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Search space as a dictionary of candidate lists
hyperparameters = {
                    'max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)],
                    'min_child_weight' : [int(x) for x in np.linspace(1, 20, num = 11)],
                    'gamma' : [float(x) for x in np.linspace(0, 1, num = 11)],
                    'tree_method' : ['auto', 'exact', 'approx', 'hist'],

                    # [1:] skips 0: XGBoost documents the range of colsample_bytree as (0, 1].
                    'colsample_bytree' : [float(x) for x in np.linspace(0, 1, num = 11)[1:]],
                    # [1:] skips 0: a zero learning rate shrinks every tree's contribution
                    # to nothing, so that candidate can never learn.
                    'eta' : [float(x) for x in np.linspace(0, 1, num = 100)[1:]],

                    'lambda' : [float(x) for x in np.linspace(0, 1, num = 11)],
                    'alpha' : [float(x) for x in np.linspace(0, 1, num = 11)]
                    }

# Init: 5-fold randomised search optimising recall
xg = XGBClassifier(random_state=42)
xg_tuned = RandomizedSearchCV(xg, hyperparameters, cv=5, random_state=42, scoring='recall')
xg_tuned.fit(X_train,y_train)

# Predict & evaluate on the held-out test set
y_pred = xg_tuned.predict(X_test)
eval_classification(xg_tuned, y_pred, X_train, y_train, X_test, y_test)