In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, f1_score,roc_curve, precision_score, recall_score,roc_auc_score
from sklearn import linear_model, tree, ensemble
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter(action="ignore")

In [None]:
def reduce_memory_usage(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    return df

In [8]:
class Preprocessing:
    def __init__(self, df):
        self.df = df
        
    def shape(self):
        print(f'shape: {self.df.shape}')
    
    def dtypes(self, pr=False):
        print("Types")
        if pr:
            print(self.df.dtypes)
            
    def supposed2beint(self):
        float_cols = [column for column in self.df.columns if self.df[column].dtype == 'float']
        int2be_cols = []
        for col in float_cols:
            if (self.df[col].fillna(-9999) % 1  == 0).all() == True:
                int2be_cols.append(col)
        return int2be_cols
                
    def isNaN(self, pr=False):
        if pr:
            print("Contain NaN")
            print(self.df.isnull().sum())
        else:
            return self.df.columns[self.df.isna().any()].tolist()
    
    def isObject(self):
        return [column for column in self.df.columns if self.df[column].dtype == 'object']
        
    def check_dataframe(self):
        self.shape()
        self.dtypes(True)
        self.isNaN(True)
        
    def fillNaN(self):
        nan_cols = self.isNaN()
        int2be_cols = self.supposed2beint()
        # nan_cols == int2be_cols they are the same
        for col in int2be_cols:
            self.df[col].fillna(round(self.df[col].mean()), inplace=True)
            self.df[col] = self.df[col].astype(int)
            
    def adjust_category_cols(self):
        self.fillNaN()
        #PreferredLoginDevice
        self.df.loc[self.df["PreferredLoginDevice"] == "Mobile Phone", "PreferredLoginDevice"] = "Phone"
        #PreferredPaymentMode
        self.df.loc[self.df["PreferredPaymentMode"] == "Credit Card", "PreferredPaymentMode"] = "CC"
        self.df.loc[self.df["PreferredPaymentMode"] == "Cash on Delivery", "PreferredPaymentMode"] = "COD"
        #PreferredLoginDevice
        self.df.loc[self.df["PreferredLoginDevice"] == "Mobile Phone", "PreferredLoginDevice"] = "Phone"
        
    def drop_useless_cols(self):
        self.df.drop(['CustomerID'], axis=1, inplace=True)
        
    def split_target(self):
        self.adjust_category_cols()
        self.drop_useless_cols()
        self.X = self.df.drop('Churn', axis=1)
        self.y = self.df['Churn'].astype(int).to_numpy()
        
    def find_enc_method(self):
        cat_cols = self.isObject()
        one_hot_cols = [col for col in cat_cols if self.X[col].nunique() <=3]
        label_enc_cols = [col for col in cat_cols if col not in one_hot_cols]
        return one_hot_cols, label_enc_cols, cat_cols
    
    def encoding(self):
        one_hot_cols, label_enc_cols, cat_cols = self.find_enc_method()
        num_cols = [col for col in self.X.columns if col not in cat_cols]
        X_OHE, X_LE, X_NUM = self.X[one_hot_cols].copy(), self.X[label_enc_cols].copy(), self.X[num_cols].copy()
        self.OHE = OneHotEncoder(drop='first', handle_unknown='error')
        X_OHE = self.OHE.fit_transform(X_OHE).toarray()
        self.le_dict = {}
        self.LE = LabelEncoder()
#         X_LE[label_enc_cols] = X_LE[label_enc_cols].apply(lambda col: self.LE.fit_transform(col))   
        for col in X_LE.columns:
            self.le_dict[col] = self.LE.fit(X_LE[col])
            X_LE[col] = self.le_dict[col].transform(X_LE[col])
        return X_OHE, X_LE.to_numpy(), X_NUM.to_numpy()

    def scaling(self):
        X_OHE, X_LE, X_num = self.encoding()
        self.SS = StandardScaler()
        X_num = self.SS.fit_transform(X_num)
        self.X_total = np.concatenate((X_OHE, X_LE, X_num), axis=1)
#         self.X_total = self.SS.fit_transform(self.X_total)
        
    def get_encoders(self):
        return self.OHE, self.LE, self.le_dict
    
    def get_scaler(self):
        return self.SS   

    def get_default_Xy(self):
        self.split_target()
        return self.X, self.y
      
    def get_Xy(self):
        self.split_target()
        self.scaling()
        return self.X_total, self.y

df = pd.read_excel('/kaggle/input/ecommerce-customer-churn-analysis-and-prediction/E Commerce Dataset.xlsx', sheet_name='E Comm')
data = pd.read_excel('/kaggle/input/ecommerce-customer-churn-analysis-and-prediction/E Commerce Dataset.xlsx', sheet_name='Data Dict')
pre = Preprocessing(df)
X, y = pre.get_Xy()
unique, counts = np.unique(y, return_counts=True)
unique, counts

(array([0, 1]), array([4682,  948]))

In [6]:
from imblearn.over_sampling import SMOTE

def apply_smote(X, y, random_state=None):
    """
    Applies SMOTE to the input features (X) and target variable (y) to balance the dataset.
    
    Parameters:
    X: numpy array or pandas DataFrame with the input features
    y: numpy array or pandas Series with the target variable
    random_state: int, default=None, controls the randomness of the SMOTE algorithm
    
    Returns:
    X_resampled: numpy array with the resampled input features
    y_resampled: numpy array with the resampled target variable
    """
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

X_resampled, y_resampled = apply_smote(X, y, random_state=42)
print(X_resampled.shape)
print(y_resampled.shape)

(9364, 19)
(9364,)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)
model = linear_model.LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
CR = classification_report(y_test, y_pred)
CM = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("="*60)
print(f"Accuracy: {accuracy:.4f}")
print("="*60)
print(f"Classification Report:\n {CR}")
print("="*60)
print(f"Confusion Matrix: {CM}")
print("="*60)
print(f"F1 Score: {f1:.4f}")
print("="*60)
print(f"ROC AUC Score: {roc_auc:.4f}")
print("="*60)

In [None]:
import optuna
from xgboost import XGBClassifier
def objective_xgb(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    param = {
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': trial.suggest_categorical("n_estimators", [150, 200, 300, 3000]),
        'max_depth': trial.suggest_categorical('max_depth', [4,5,7,9,11,13,15,17]),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = XGBClassifier(**param)
    model.fit(X_train, y_train, early_stopping_rounds=50, eval_set=[(X_test, y_test)], verbose=False)   
    preds = model.predict(X_test)
    
    acc = accuracy_score(y_test, preds)      

    return acc

study = optuna.create_study(direction='maximize')
study.optimize(objective_xgb, n_trials=50)
params_xgb = study.best_trial.params
print('Number of finished trials:', len(study.trials))
print('Best trial:', params_xgb)

In [None]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
model = XGBClassifier(**params_xgb)
model.fit(X_train, y_train, early_stopping_rounds=50, eval_set=[(X_test, y_test)], verbose=False)   
y_pred = model.predict(X_test)
print(f"Accuracy: {round(accuracy_score(y_pred, y_test), 2)}")
print("="*60)
print(f"Recall: {round(recall_score(y_pred,y_test),2)}")
print("="*60)
print(f"Precision: {round(precision_score(y_pred,y_test), 2)}")
print("="*60)
print(f"F1: {round(f1_score(y_pred,y_test), 2)}")
print("="*60)
print(f"Auc: {round(roc_auc_score(y_pred,y_test), 2)}")
print("="*60)
CR = classification_report(y_test, y_pred)
CM = confusion_matrix(y_test, y_pred)
print(f"Classification Report:\n {CR}")
print("="*60)
print(f"Confusion Matrix: {CM}")
print("="*60)

In [9]:
import optuna
from lightgbm import LGBMClassifier
def objective_lgbm(trial):
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
    param = {
        'random_state': 42,
        'n_estimators': trial.suggest_categorical("n_estimators", [150, 200, 300, 3000]),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = LGBMClassifier(**param)
    model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_test, y_test)], verbose=False)    
    preds = model.predict(X_test)
    
    acc = accuracy_score(y_test, preds)      

    return acc

study = optuna.create_study(direction='maximize')
study.optimize(objective_lgbm, n_trials=50)
params_lgbm = study.best_trial.params
print('Number of finished trials:', len(study.trials))
print('Best trial:', params_lgbm)

[32m[I 2023-03-29 17:27:18,884][0m A new study created in memory with name: no-name-350ed4d5-55ec-47fb-99ec-07015b3b75b6[0m
[32m[I 2023-03-29 17:27:20,333][0m Trial 0 finished with value: 0.9313167259786477 and parameters: {'n_estimators': 150, 'reg_alpha': 1.1544719515821524, 'reg_lambda': 0.658156610034278, 'colsample_bytree': 0.7, 'subsample': 0.4, 'learning_rate': 0.02, 'max_depth': 10, 'num_leaves': 835, 'min_child_samples': 103, 'min_data_per_groups': 72}. Best is trial 0 with value: 0.9313167259786477.[0m
[32m[I 2023-03-29 17:27:52,713][0m Trial 1 finished with value: 0.9879003558718861 and parameters: {'n_estimators': 3000, 'reg_alpha': 0.02206757662063572, 'reg_lambda': 3.2974256213878115, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.02, 'max_depth': 100, 'num_leaves': 684, 'min_child_samples': 95, 'min_data_per_groups': 62}. Best is trial 1 with value: 0.9879003558718861.[0m
[32m[I 2023-03-29 17:27:54,308][0m Trial 2 finished with value: 0.88718861

Number of finished trials: 50
Best trial: {'n_estimators': 3000, 'reg_alpha': 0.009102095916152044, 'reg_lambda': 0.21111366955566466, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.014, 'max_depth': 20, 'num_leaves': 512, 'min_child_samples': 127, 'min_data_per_groups': 90}


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y)
model = LGBMClassifier(**params_lgbm)
model.fit(X_train, y_train, early_stopping_rounds=50, eval_set=[(X_test, y_test)], verbose=False)   
y_pred = model.predict(X_test)
print(f"Accuracy: {round(accuracy_score(y_pred, y_test), 2)}")
print("="*60)
print(f"Recall: {round(recall_score(y_pred,y_test),2)}")
print("="*60)
print(f"Precision: {round(precision_score(y_pred,y_test), 2)}")
print("="*60)
print(f"F1: {round(f1_score(y_pred,y_test), 2)}")
print("="*60)
print(f"Auc: {round(roc_auc_score(y_pred,y_test), 2)}")
print("="*60)
CR = classification_report(y_test, y_pred)
CM = confusion_matrix(y_test, y_pred)
print(f"Classification Report:\n {CR}")
print("="*60)
print(f"Confusion Matrix: {CM}")
print("="*60)

Accuracy: 0.99
Recall: 0.98
Precision: 0.99
F1: 0.99
Auc: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      1401
           1       0.98      0.99      0.99      1409

    accuracy                           0.99      2810
   macro avg       0.99      0.99      0.99      2810
weighted avg       0.99      0.99      0.99      2810

Confusion Matrix: [[1375   26]
 [   9 1400]]


In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cnt = 1
# split()  method generate indices to split data into training and test set.
for train_index, test_index in kf.split(X, y):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')
    cnt+=1

score = cross_val_score(ensemble.RandomForestClassifier(random_state= 42), X, y, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

In [None]:
from catboost import CatBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

catboost_model = CatBoostClassifier(verbose=False, random_state=42).fit(X_train, y_train)
y_pred = catboost_model.predict(X_test)

print(f"Accuracy: {round(accuracy_score(y_pred, y_test), 2)}")
print("="*60)
print(f"Recall: {round(recall_score(y_pred,y_test),2)}")
print("="*60)
print(f"Precision: {round(precision_score(y_pred,y_test), 2)}")
print("="*60)
print(f"F1: {round(f1_score(y_pred,y_test), 2)}")
print("="*60)
print(f"Auc: {round(roc_auc_score(y_pred,y_test), 2)}")
print("="*60)
CR = classification_report(y_test, y_pred)
CM = confusion_matrix(y_test, y_pred)
print(f"Classification Report:\n {CR}")
print("="*60)
print(f"Confusion Matrix: {CM}")
print("="*60)