### Imports

In [2]:
from datetime import datetime
import os, sys
import copy

import torch
import catboost

import sklearn
from sklearn.model_selection import train_test_split

# add my own file with custom utility functions as a module
utils_path = os.path.normpath(os.path.abspath(os.path.join(os.path.dirname('__file__'), os.path.pardir)))
if utils_path not in sys.path:
    sys.path.append(utils_path)

import aku_utils as ak

import pandas as pd
import numpy as np

# pandas options
pd.options.display.max_columns = 100
pd.options.display.max_rows =  200
# pd.options.display.max_info_rows = 1690785
pd.options.display.max_info_columns = 200
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.date_dayfirst = True
pd.options.mode.chained_assignment = None

### Data overview

In [3]:
df = pd.read_csv(os.path.join(os.path.dirname('__file__'), os.path.pardir, 'data', 'telco_processed_2.csv'))
df.head(-5)

Unnamed: 0,gender,age,married,number_of_dependents,latitude,longitude,number_of_referrals,phone_service,multiple_lines,avg_monthly_gb_download,online_security,online_backup,device_protection_plan,premium_tech_support,streaming_tv,unlimited_data,paperless_billing,total_refunds,total_extra_data_charges,total_long_distance_charges,satisfaction_score,churn,cltv,corr_total_charges,contract_one_year,contract_two_year,payment_method_credit_card,payment_method_mailed_check,offer_offer_b,offer_offer_c,offer_offer_d,offer_offer_e,offer_nan,internet_type_dsl,internet_type_fiber_optic,internet_type_nan,streaming_music_or_movies
0,1,78,0,0,34.02,-118.16,0,0,0,8,0,0,1,0,0,0,1,0.00,0.50,0.00,3,1,5433,19.82,0,0,0,0,0,0,0,0,1,1,0,0,1
1,0,74,1,1,34.04,-118.19,1,1,1,17,0,1,0,0,0,1,1,0.00,0.00,0.62,3,1,5302,126.66,0,0,1,0,0,0,0,1,0,0,1,0,0
2,1,71,0,3,34.11,-118.23,0,1,1,52,0,0,0,0,1,1,1,0.03,0.00,0.12,2,1,3179,438.14,0,0,0,0,0,0,1,0,0,0,1,0,1
3,0,78,1,1,33.94,-118.33,1,1,0,12,0,1,1,0,1,1,1,0.01,0.00,0.20,2,1,5337,502.90,0,0,0,0,0,1,0,0,0,0,1,0,1
4,0,80,1,1,33.97,-118.02,1,1,1,14,0,0,0,0,0,1,1,0.00,0.00,0.08,2,1,2793,717.04,0,0,0,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7033,0,23,1,0,33.28,-115.96,2,1,0,53,0,1,0,1,1,1,0,0.00,0.00,0.14,5,0,5553,865.25,0,1,0,0,0,0,0,0,1,1,0,0,0
7034,0,57,0,0,33.14,-116.97,0,1,0,0,0,0,0,0,0,0,1,0.00,0.00,0.35,3,0,5191,39.25,0,0,0,1,0,0,0,1,0,0,0,1,0
7035,1,63,0,0,33.04,-115.61,0,1,0,2,0,0,0,0,0,1,1,0.01,0.00,0.51,3,0,4591,875.08,0,0,1,0,0,0,0,0,1,0,1,0,0
7036,1,57,0,0,32.85,-114.85,0,1,0,13,0,0,0,0,1,1,1,0.02,0.00,0.38,3,0,2464,498.37,0,0,0,0,0,0,0,0,1,0,1,0,0


# Catboost

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('churn', axis=1), df['churn'], test_size=0.20)

In [None]:
model = catboost.CatBoostClassifier(loss_function='Logloss', random_seed=888)

# train the model
model.fit(X_train, y_train)

In [None]:
print(sklearn.metrics.classification_report(y_test, model.predict(X_test), digits=3))

              precision    recall  f1-score   support

           0      0.967     0.985     0.976      1031
           1      0.958     0.907     0.932       378

    accuracy                          0.965      1409
   macro avg      0.962     0.946     0.954      1409
weighted avg      0.964     0.965     0.964      1409



# NN

In [8]:
def get_data_loaders(df, target, val_rate=.2, batch_size=64):
    X_train, X_test, y_train, y_val = train_test_split(df.drop(target, axis=1), df[target], test_size=val_rate)

    training_set = torch.utils.data.TensorDataset(
        torch.tensor(X_train.values, dtype=torch.float32),
        torch.tensor(y_train.values, dtype=torch.float32))

    validation_set = torch.utils.data.TensorDataset(
        torch.tensor(X_test.values, dtype=torch.float32),
        torch.tensor(y_val.values, dtype=torch.float32))

    training_loader = torch.utils.data.DataLoader(training_set, batch_size=batch_size, shuffle=True)
    validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=True) # here shuffle is recommended for using early stopping on big batches later
    return training_loader, validation_loader

training_loader, validation_loader = get_data_loaders(df, 'churn')

In [9]:
# class Trainer():
#     def __init__():
#         return None


#     def train(model,
#             loss_fn,
#             optimizer,
#             training_loader,
#             validation_loader,
#             *,
#             bbatch_size : int | None = 16,
#             print_loss : str | None = None,
#             graph_loss : str | None = 'bbatch',
#             early_stopping : str | None = 'bbatch',
#             early_stopping_params : dict | None = None,
#             max_epochs : int = 100) -> None:
#         '''
#         Optional args:
#             bbatch_size: size of a big batch (in usual batches, defined by training loader) (16)

#             print_loss: print loss (None) ['epoch', 'bbatch', None]
#             graph_loss: graph loss ('bbatch') ['epoch', 'bbatch', None]
#                 Not recommended to use both

#             early_stopping: early stopping done on epochs, big batches or not implemented ('bbatch') ['epoch', 'bbatch', None]
#                 It is recommended that early stopping matches loss reporting (big batches/epochs)
#             early_stopping_params: dict
#                 X
#                 path: path to save the model ('models/model' is default and recommended)
#                 save: save mode for early stopping ('stop') ['new', 'overwrite', 'stop']
#                     'new' - save new model each time improvement is noticed (not recommended),
#                     'overwrite' - overwrite model each time improvement is noticed (recommended for smaller NNs)
#                     'stop' - do not save at all, simply break training if early stop is triggered (recommended for larger NNs)

#             max_epochs: maximum number of epochs to run (100). If reached it will print to stdout, suggesting that the training should be run again.
#         '''
#         #
#         # option validation
#         #


#         #
#         # option processing
#         #
#         if early_stopping_params is None:
#             early_stopping_params = {}

#         early_stopping_params.setdefault('path', 'models/model')
#         early_stopping_params.setdefault('save', 'stop')

#         needs_epoch_val_loss = (print_loss == 'epoch') or (graph_loss == 'epoch')

#         # option processing for the _train_epoch()
#         # all the things like early stopping is done per big batch inside _train_epoch()
#         # thats why params below are named 'bbatch_...'
#         bbatch_print_loss = print_loss == 'bbatch'
#         bbatch_graph_loss = graph_loss == 'bbatch'
#         bbatch_early_stopping = early_stopping == 'bbatch'

#         #
#         # start
#         #

#         min_val_loss = 1e9
#         for epoch in range(max_epochs):

#             train_loss = _train_epoch(model, loss_fn, optimizer, training_loader, validation_loader,
#                                         bbatch_size = bbatch_size,
#                                         print_loss = bbatch_print_loss,
#                                         graph_loss = bbatch_graph_loss,
#                                         early_stopping = bbatch_early_stopping,
#                                         early_stopping_params = early_stopping_params)

#             if needs_epoch_val_loss:
#                 pass # val_loss = _validate_epoch(model, validation_loader)

#             if print_loss == 'epoch':
#                 pass

#             if graph_loss == 'epoch':
#                 pass

#         return None


#     def _train_epoch(model,
#                     loss_fn,
#                     optimizer,
#                     training_loader,
#                     validation_loader,
#                     *,
#                     bbatch_size : int | None = 16,
#                     print_loss : bool,
#                     graph_loss : bool,
#                     early_stopping : bool,
#                     early_stopping_params : dict | None = None) -> float:

#         epoch_loss = 0.
#         prev_value_epoch_loss = 0. # for training big batch loss reporting

#         for batch_index, (inputs, labels) in enumerate(training_loader):

#             # training stuff
#             optimizer.zero_grad()

#             outputs = model(inputs)
#             loss = loss_fn(outputs, labels.unsqueeze(1))

#             loss.backward()
#             optimizer.step()

#             # sum loss for reporting
#             epoch_loss += loss.item()

#             if batch_index % bbatch_size == bbatch_size - 1:

#                 # we use this to extract only the loss attributed to this big batch
#                 # we do not track big batch explicitly, because we do that for epoch loss
#                 training_big_batch_loss = epoch_loss - prev_value_epoch_loss
#                 prev_value_epoch_loss = copy.deepcopy(epoch_loss)

#                 validation_big_batch_loss = ...

#                 # early stopping

#                 # plot training_big_batch_loss and validation_big_batch_loss

#         epoch_loss = epoch_loss / (batch_index + 1)
#         return epoch_loss

In [10]:
class NeuralNetwork(torch.nn.Module):
    def __init__(self, n_features_in : int):
        super().__init__()
        self.linear_relu_stack = torch.nn.Sequential(
            torch.nn.Linear(n_features_in, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        x = self.linear_relu_stack(x)
        return x

model = NeuralNetwork(df.shape[1]-1)

loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)

In [11]:
class Trainer():
    def __init__(self,
                 model,
                 loss_fn,
                 optimizer,
                 training_loader,
                 validation_loader,
                 *,
                 bbatch_size : int | None = 16,
                 val_bbatch_size : int | None = 16,
                 print_loss : str | None = None,
                 graph_loss : str | None = 'bbatch',
                 early_stopping : str | None = 'bbatch',
                 early_stopping_params : dict | None = None,
                 max_epochs : int = 100) -> None:
        '''
        Loss tracking is done by storing the losses as pandas dataframe with columns:
            epoch
            big train batch index
            big train batch loss
            big val batch loss

        Loss tracking for user is done by 1) printing 2) graphing 3) Nothing.

        Loss tracking is done either on big batches or epochs.

        Early stopping is done either on big batches or epochs.

        Optional args:
            bbatch_size: size of a big batch (in usual batches, defined by training loader) (16)
            val_bbatch_size: same for validation. Losses for big val batches will be calculated after a big train batch has run

            print_loss: print loss (None) ['epoch', 'bbatch', None]
            graph_loss: graph loss ('bbatch') ['epoch', 'bbatch', None]
                Not recommended to use both

            early_stopping: early stopping done on epochs, big batches or not implemented ('bbatch') ['epoch', 'bbatch', None]
                It is recommended that early stopping matches loss reporting (big batches/epochs)
            early_stopping_params: dict
                X
                path: path to save the model ('models/model' is default and recommended)
                save: save mode for early stopping ('stop') ['new', 'overwrite', 'stop']
                    'new' - save new model each time improvement is noticed (not recommended),
                    'overwrite' - overwrite model each time improvement is noticed (recommended for smaller NNs)
                    'stop' - do not save at all, simply break training if early stop is triggered (recommended for larger NNs)

            max_epochs: maximum number of epochs to run (100)
        '''
        #
        # option validation
        #


        #
        # option processing
        #
        if early_stopping_params is None:
            early_stopping_params = {}

        early_stopping_params.setdefault('path', 'models/model')
        early_stopping_params.setdefault('save', 'stop')

        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.training_loader = training_loader
        self.validation_loader = validation_loader
        self.bbatch_size = bbatch_size
        self.val_bbatch_size = val_bbatch_size
        self.print_loss = print_loss
        self.graph_loss = graph_loss
        self.early_stopping = early_stopping
        self.early_stopping_params = early_stopping_params
        self.max_epochs = max_epochs

        self.cur_epoch = None
        self.train_epoch_losses = []
        self.val_epoch_losses = []
        self.train_bbatch_losses = []
        self.val_bbatch_losses = []
        self.epoch_loss_df = pd.DataFrame()
        self.bbatch_loss_df = pd.DataFrame()
        return None


    def train(self) -> None:
        for epoch in range(self.max_epochs):
            self.cur_epoch = epoch

            if self.print_loss:
                # print epoch title
                pass

            train_loss = self._train_epoch()
            self.train_epoch_losses.append(train_loss)
            print(train_loss)
            if self.print_loss == 'epoch' or self.graph_loss == 'epoch':
                pass # val_loss = _validate_epoch()

            if self.print_loss == 'epoch':
                # print epoch train and val losses 
                pass

            if self.graph_loss == 'epoch':
                pass
            
            # if early stop has activated: stop training and print it out

        # if all epochs have run, print that
        return None


    def _train_epoch(self) -> float:
        '''
        Train for epoch, return mean training loss
        '''
        epoch_loss = 0.
        prev_value_epoch_loss = 0. # for training big batch loss reporting

        for batch_index, (inputs, labels) in enumerate(self.training_loader):

            # training stuff
            self.model.train()
            self.optimizer.zero_grad()

            outputs = self.model(inputs)
            
            loss = self.loss_fn(outputs, labels.unsqueeze(1))
            loss.backward()
            self.optimizer.step()

            # sum loss for reporting
            epoch_loss += loss.item()

            if self.bbatch_size is not None and batch_index % self.bbatch_size == self.bbatch_size - 1:

                # we use this to extract only the loss attributed to this big batch
                # we do not track big batch explicitly, because we do that for epoch loss
                training_bbatch_loss = (epoch_loss - prev_value_epoch_loss) / self.bbatch_size
                prev_value_epoch_loss = copy.deepcopy(epoch_loss)
                self.train_bbatch_losses.append(training_bbatch_loss)

                val_bbatch_loss = self._get_val_loss(on='bbatch')
                self.val_bbatch_losses.append(val_bbatch_loss)

                # early stopping

                # plot training_big_batch_loss and validation_big_batch_loss

        epoch_loss = epoch_loss / (batch_index + 1)
        return epoch_loss


    def _get_val_loss(self, on : str) -> float:
        '''
        Function sets the modes (eval, train) itself
        '''
        val_loss = 0.
        self.model.eval()

        with torch.no_grad():
            for batch_index, (inputs, labels) in enumerate(self.validation_loader):
                outputs = self.model(inputs)
                loss = self.loss_fn(outputs, labels.unsqueeze(1))
                val_loss += loss.item()

                if on == 'bbatch' and batch_index % self.val_bbatch_size == self.val_bbatch_size - 1:
                    break
        
        val_loss = val_loss / (batch_index + 1)
        return val_loss


Trainer(model, loss_fn, optimizer, training_loader, validation_loader,
        bbatch_size=32, val_bbatch_size=32, max_epochs=20).train()

45.44264409247409
1.7010993187347154
0.5583288880546441
0.5221364672264356
0.5192134236351827
0.507265346606126
0.5030971264906143
0.5086156056168374
0.5054331324743421
0.5055928290560004
0.5013445826728692
0.4903987883851769
0.4814265225543065
0.4967635247144806
0.4904876954100105
0.47804487588700284
0.47830919737226507
0.4715887324863605
0.4798377594921026
0.46283281468943266
