In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
        
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
from scipy import stats

In [None]:
# Load the training data and discard the ID column.
df = pd.read_csv('/kaggle/input/customer-analytics/Train.csv').drop(columns=['ID'])

# Replace the original headers with short snake_case names (assigned by position).
df.columns = [
    'ware_block', 'mode_ship', 'cust_call', 'cust_rating', 'product_cost',
    'prior_purchase', 'product_impt', 'gender', 'discount', 'weight',
    'not_ontime_delivery',
]

df_num = ['cust_call', 'cust_rating', 'product_cost', 'prior_purchase', 'discount', 'weight']
df_cat = ['ware_block', 'mode_ship', 'gender']

# Feature scaling
# NOTE(review): every scaler below is fit on the full dataset, before the
# train/test split done in later cells, so test rows influence the scaling
# statistics. Confirm this is acceptable for the scale-sensitive models.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardise these columns in place (each column gets its own scaler).
for std_col in ('cust_call', 'cust_rating', 'prior_purchase'):
    df[std_col] = StandardScaler().fit_transform(df[std_col].to_numpy().reshape(-1, 1))

# Min-max scale these columns into new "<name>_norm" columns, then drop the raw ones.
minmax_cols = ['product_cost', 'discount', 'weight']
for raw_col in minmax_cols:
    df[raw_col + '_norm'] = MinMaxScaler().fit_transform(df[raw_col].to_numpy().reshape(-1, 1))
df = df.drop(columns=minmax_cols)

# Ordinal labelling of product importance (low=1, medium=2, anything else=3)
def product_impt(x):
    """Convert a row's ``product_impt`` text into an ordinal code.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing a ``'product_impt'`` entry that supports the ``in``
        operator (a string in this notebook).

    Returns
    -------
    int
        1 if the value contains ``'low'``, 2 if it contains ``'medium'``,
        and 3 for every other value.
    """
    level = x['product_impt']
    # Checked in order, so 'low' wins over 'medium' if both substrings occur.
    for keyword, code in (('low', 1), ('medium', 2)):
        if keyword in level:
            return code
    return 3
# Replace the product-importance text with its ordinal code (row-wise).
df['product_impt'] = df.apply(product_impt, axis=1)

# Feature encoding: one-hot encode each categorical column and append the dummies.
for cat in df_cat:
    onehots = pd.get_dummies(df[cat], prefix=cat)
    df = df.join(onehots)
# Drop the source columns that were just encoded, plus gender_F
# (gender_M is kept as the single gender indicator).
df = df.drop(columns=['ware_block', 'mode_ship', 'gender', 'gender_F'])

# Features = every non-object column except the target.
feature_cols = [
    col for col in df.columns
    if str(df[col].dtype) != 'object' and col != 'not_ontime_delivery'
]
X = df[feature_cols]
y = df['not_ontime_delivery'].values

from imblearn import under_sampling, over_sampling

# sampling_strategy must be passed by keyword: recent imbalanced-learn
# releases make it keyword-only, so the positional form raises a TypeError.
# random_state is fixed so the resampled sets are reproducible, matching the
# seed used elsewhere in this notebook.
X_under, y_under = under_sampling.RandomUnderSampler(
    sampling_strategy=0.9, random_state=42
).fit_resample(X, y)
X_over, y_over = over_sampling.RandomOverSampler(
    sampling_strategy=0.9, random_state=42
).fit_resample(X, y)
X_over_SMOTE, y_over_SMOTE = over_sampling.SMOTE(random_state=42).fit_resample(X, y)

# Class counts after each resampling strategy.
print(pd.Series(y_under).value_counts())
print(pd.Series(y_over).value_counts())
print(pd.Series(y_over_SMOTE).value_counts())

In [None]:
# Save the preprocessed frame to the working directory; index=True also
# writes the row index as the first CSV column.
df.to_csv('mycsvfile.csv',index=True)

## Decision Tree 1

In [None]:
import warnings
# Silence every warning raised from this point on.
warnings.filterwarnings('ignore')

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    auc,
)
from sklearn.model_selection import train_test_split

# Feature set 1: every column except the target label.
y = df['not_ontime_delivery']
X = df.drop('not_ontime_delivery', axis=1)

# Hold out 25% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

def eval_classification(model, pred, xtrain, ytrain, xtest, ytest):
    """Print test-set classification metrics for a fitted binary classifier.

    Parameters
    ----------
    model : fitted estimator
        Used to obtain continuous scores for the ROC curve through
        ``predict_proba`` or, failing that, ``decision_function``.
    pred : array-like
        Hard class predictions for ``xtest``.
    xtrain, ytrain :
        Not used; kept so existing calls keep working.
    xtest : array-like
        Test features, passed to ``model`` to obtain scores.
    ytest : array-like
        True test labels; label ``1`` is treated as the positive class.

    Returns
    -------
    None
        Accuracy, precision, recall, F1 and AUC are printed with two decimals.
    """
    print("Accuracy (Test Set): %.2f" % accuracy_score(ytest, pred))
    print("Precision (Test Set): %.2f" % precision_score(ytest, pred))
    print("Recall (Test Set): %.2f" % recall_score(ytest, pred))
    print("F1-Score (Test Set): %.2f" % f1_score(ytest, pred))

    # A ROC curve needs continuous scores. Feeding it hard 0/1 predictions
    # yields a single operating point, so the "AUC" would not measure ranking
    # quality. Prefer probabilities, then decision scores, and only fall back
    # to the hard predictions when the model exposes neither.
    if hasattr(model, "predict_proba"):
        # Column 1 is taken as the positive-class probability; this assumes
        # a binary target whose classes sort as [0, 1].
        scores = model.predict_proba(xtest)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(xtest)
    else:
        scores = pred

    fpr, tpr, _ = roc_curve(ytest, scores, pos_label=1)  # pos_label: the label treated as positive
    print("AUC: %.2f" % auc(fpr, tpr))
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyperparameters on feature set 1.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)
eval_classification(
    model=model, pred=y_pred,
    xtrain=X_train, ytrain=y_train,
    xtest=X_test, ytest=y_test,
)

## Decision Tree 2

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Feature set 2: also leave out ware_block_C, mode_ship_Road and gender_M.
X = df.drop(columns=[
    'not_ontime_delivery',
    'ware_block_C', 'mode_ship_Road', 'gender_M',
])
y = df['not_ontime_delivery']  # target / label

# Same 75/25 split and seed as the other decision-tree experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

## Decision Tree 3

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Feature set 3: feature set 2 minus ware_block_D, ware_block_B and prior_purchase.
X = df.drop(columns=[
    'not_ontime_delivery',
    'ware_block_C', 'mode_ship_Road', 'gender_M',
    'ware_block_D', 'ware_block_B', 'prior_purchase',
])
y = df['not_ontime_delivery']  # target / label

# Same 75/25 split and seed as the other decision-tree experiments.
# The cells below reuse this X_train / X_test split for every later model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

#### Dari ketiga model Decision Tree, yang paling bagus hasil evaluasinya adalah model 3. Oleh karena itu, kami memutuskan untuk melakukan hyperparameter tuning untuk memaksimalkan model tersebut.

## Tuning Hyperparameter model 3

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Hyperparameter search space
max_depth = list(range(1, 21))  # maximum number of levels in the tree
criterion = ['gini', 'entropy']
splitter = ['best', 'random']
# Minimum number of samples required to split a node. scikit-learn requires
# an integer >= 2 here; the previous grid started at 1, so those candidates
# failed to fit and were scored NaN (silently, with warnings suppressed).
min_samples_split = list(range(2, 101))
min_samples_leaf = list(range(1, 1101))  # minimum number of samples required at each leaf
# Number of features considered at every split. 'auto' was only an alias of
# 'sqrt' for classifiers and has been removed from recent scikit-learn
# releases; None (use all features) takes its place in the grid.
max_features = ['sqrt', 'log2', None]

hyperparameters = dict(
    max_depth=max_depth,
    criterion=criterion,
    splitter=splitter,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
)

# Randomised search over the space above, 5-fold CV, optimising recall.
dt = DecisionTreeClassifier(random_state=42)
model = RandomizedSearchCV(dt, hyperparameters, cv=5, random_state=42, scoring='recall')
model.fit(X_train, y_train)

# Predict & evaluate on the held-out test set.
y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)

best_params = model.best_estimator_.get_params()
print('\nBest max_depth:', best_params['max_depth'])
print('Best Criterion:', best_params['criterion'])
print('Best Splitter:', best_params['splitter'])
print('Best min_samples_split:', best_params['min_samples_split'])
print('Best min_samples_leaf:', best_params['min_samples_leaf'])
print('Best max_features:', best_params['max_features'])

# model.score() uses the search's scoring metric, so both numbers below are
# recall (not accuracy).
print('\nUnderfitting/overfitting check')
print('Train score: ' + str(model.score(X_train, y_train)))
print('Test score:' + str(model.score(X_test, y_test)))


**Hasil check** : Model termasuk **fit**, karena train score dan test score (di sini berupa recall, sesuai `scoring='recall'` pada RandomizedSearchCV) hampir sama!

In [None]:
# Visual check of the tuned tree (only the top 7 levels are drawn).
from sklearn import tree
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(30, 10))
tree.plot_tree(
    model.best_estimator_,
    feature_names=list(X.columns),
    class_names=['0', '1'],
    filled=True,
    max_depth=7,
    fontsize=12,
    ax=ax,  # the axes just created, which is also the current axes
)

plt.show()

In [None]:
# Feature importances of the tuned tree, largest first (at most 25 bars).
feat_importances = pd.Series(model.best_estimator_.feature_importances_, index=X.columns)
top_importances = feat_importances.nlargest(25)
ax = top_importances.plot.barh(figsize=(10, 8))
ax.invert_yaxis()  # put the most important feature at the top

ax.set_xlabel('score')
ax.set_ylabel('feature')
ax.set_title('feature importance score')

**Conclusion:** Model dari algoritma Decision Tree menghasilkan recall sebesar 98%. Artinya model berhasil mengenali 98% dari pengiriman yang benar-benar terlambat. Berdasarkan feature importance, prediksi keterlambatan paling berkaitan dengan variabel:
- Berat barang yang dikirim (weight)
- Harga barang (Product cost)
- Tingkat kepentingan barang (Product importance)
- Barang yang berasal dari gudang block A (warehouse block A)
- Jenis pengiriman menggunakan kapal dan pesawat

# Cek Model Lainnya

## Random Forest

In [None]:
# Random forest baseline with default hyperparameters.
# Reuses the X_train / X_test split left in scope by the preceding cells.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
eval_classification(
    model=rf, pred=y_pred,
    xtrain=X_train, ytrain=y_train,
    xtest=X_test, ytest=y_test,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Hyperparameter search space
hyperparameters = dict(
    n_estimators=[int(x) for x in np.linspace(start=100, stop=2000, num=20)],  # number of trees
    bootstrap=[True],  # whether each tree is built on a bootstrap sample
    criterion=['gini', 'entropy'],
    max_depth=[int(x) for x in np.linspace(10, 110, num=11)],  # maximum tree depth
    min_samples_split=[int(x) for x in np.linspace(start=2, stop=10, num=5)],  # min samples in a node for it to be split
    min_samples_leaf=[int(x) for x in np.linspace(start=1, stop=10, num=5)],  # min samples required in a leaf
    # Number of features considered at each split. 'auto' was only an alias
    # of 'sqrt' for classifiers and has been removed from recent scikit-learn
    # releases (those candidates would fail to fit); None (all features)
    # takes its place in the grid.
    max_features=['sqrt', 'log2', None],
    n_jobs=[-1],  # cores for parallel computation; -1 uses all of them
)

# Randomised search, 5-fold CV, optimising recall.
rf = RandomForestClassifier(random_state=42)
rf_tuned = RandomizedSearchCV(rf, hyperparameters, cv=5, random_state=42, scoring='recall')
rf_tuned.fit(X_train, y_train)

# Predict & evaluate on the held-out test set.
y_pred = rf_tuned.predict(X_test)
eval_classification(rf_tuned, y_pred, X_train, y_train, X_test, y_test)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings.
# Reuses the X_train / X_test split left in scope by the preceding cells.
model = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Original Logistic Regression:')
eval_classification(
    model=model, pred=y_pred,
    xtrain=X_train, ytrain=y_train,
    xtest=X_test, ytest=y_test,
)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost baseline with default hyperparameters.
# Reuses the X_train / X_test split left in scope by the preceding cells.
ab = AdaBoostClassifier(random_state=42).fit(X_train, y_train)

y_pred = ab.predict(X_test)
eval_classification(
    model=ab, pred=y_pred,
    xtrain=X_train, ytrain=y_train,
    xtest=X_test, ytest=y_test,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Hyperparameter search space
hyperparameters = {
    # number of boosting iterations: 20 evenly spaced values from 100 to 2000
    'n_estimators': np.linspace(100, 2000, 20).astype(int).tolist(),
    # 20 evenly spaced learning rates from 0.001 to 0.1
    'learning_rate': np.linspace(0.001, 0.1, 20).tolist(),
    # NOTE(review): 'SAMME.R' appears to be deprecated in newer scikit-learn
    # releases - confirm against the installed version.
    'algorithm': ['SAMME', 'SAMME.R'],
}

# Randomised search, 5-fold CV, optimising recall.
ab = AdaBoostClassifier(random_state=42)
ab_tuned = RandomizedSearchCV(ab, hyperparameters, random_state=42, cv=5, scoring='recall')
ab_tuned.fit(X_train, y_train)

# Predict & evaluate on the held-out test set.
y_pred = ab_tuned.predict(X_test)
eval_classification(
    model=ab_tuned, pred=y_pred,
    xtrain=X_train, ytrain=y_train,
    xtest=X_test, ytest=y_test,
)

## Boosting: XGBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost baseline with default hyperparameters.
# Reuses the X_train / X_test split left in scope by the preceding cells.
xg = XGBClassifier(random_state=42)
xg.fit(X_train, y_train)

y_pred = xg.predict(X_test)
eval_classification(
    model=xg, pred=y_pred,
    xtrain=X_train, ytrain=y_train,
    xtest=X_test, ytest=y_test,
)

In [None]:
# Hyperparameter search space
# 11 evenly spaced values covering [0, 1]; shared by several parameters below.
unit_interval_grid = np.linspace(0, 1, num=11).tolist()

hyperparameters = {
    'max_depth': np.linspace(10, 110, num=11).astype(int).tolist(),
    'min_child_weight': np.linspace(1, 20, num=11).astype(int).tolist(),
    'gamma': unit_interval_grid,
    'tree_method': ['auto', 'exact', 'approx', 'hist'],

    'colsample_bytree': unit_interval_grid,
    'eta': np.linspace(0, 1, num=100).tolist(),

    'lambda': unit_interval_grid,
    'alpha': unit_interval_grid,

    'verbosity': [0],  # silence XGBoost warning messages
}

# Randomised search, 5-fold CV, optimising recall.
xg = XGBClassifier(random_state=42)
xg_tuned = RandomizedSearchCV(xg, hyperparameters, cv=5, random_state=42, scoring='recall')
xg_tuned.fit(X_train, y_train)

# Predict & evaluate on the held-out test set.
y_pred = xg_tuned.predict(X_test)
eval_classification(
    model=xg_tuned, pred=y_pred,
    xtrain=X_train, ytrain=y_train,
    xtest=X_test, ytest=y_test,
)