In [1]:
from lumpia.load_data.load_data import read_data
from lumpia.model.model import train_model
from lumpia.pre_processing.pre_processing import drop_nan, fill_mean
from lumpia.train_test.train_test import split
from lumpia.evaluation.evaluation import get_roc_auc_score
from lumpia.features.features import gen_dummies, normalize, take_log
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Pre-processing: read the raw churn CSV and run it through drop_nan for the
# columns listed in cols_required.
# NOTE(review): presumably drop_nan discards rows with missing values in those
# columns — confirm against lumpia.pre_processing.

cols_required = ['DataUsage', 'ContractRenewal']

churn_raw = read_data('churn_data.csv')
churn_df = drop_nan(churn_raw, cols_required)

In [3]:
# Features: build the modelling frame in three passes —
#   1. gen_dummies on DataPlan / ContractRenewal,
#   2. normalize on the columns in cols_to_norm,
#   3. take_log on OverageFee.
# NOTE(review): churn_df is rebound to the gen_dummies output here, so
# re-running this cell on its own feeds the already-transformed frame back in;
# re-run from the pre-processing cell instead.

cols_to_dummy = ['DataPlan', 'ContractRenewal']
cols_to_norm = [
    'AccountWeeks',
    'DataUsage',
    'CustServCalls',
    'DayMins',
    'DayCalls',
    'MonthlyCharge',
    'RoamMins',
]
cols_to_log = ['OverageFee']

churn_df = gen_dummies(churn_df, cols_to_dummy)

norm_df = normalize(churn_df, cols_to_norm)
norm_df = take_log(norm_df, cols_to_log)


In [4]:
# Sanity check: list the columns of the frame produced by the feature cell.
print(norm_df.columns)

Index(['Churn', 'AccountWeeks', 'DataUsage', 'CustServCalls', 'DayMins',
       'DayCalls', 'MonthlyCharge', 'OverageFee', 'RoamMins', 'DataPlan_1',
       'ContractRenewal_1'],
      dtype='object')


In [5]:
#Split: Partition the dataset into train/test features (X_*) and targets (y_*) for fitting and evaluating a model
# NOTE(review): no seed or test-size argument is passed here — confirm that
# split() is deterministic so the scores below can be reproduced.
X_train, X_test, y_train, y_test = split(norm_df)

In [6]:
# Trainer: fit a model on the train/test splits; train_model is also told
# which columns are features and which are targets.

target_cols = ['Churn']
# Build the feature list with a comprehension. list.remove() mutates the list
# in place and returns None, so assigning its result would hand
# feature_cols=None to train_model.
feature_cols = [col for col in norm_df.columns if col not in target_cols]

results = train_model(X_train, X_test, y_train, y_test, feature_cols, target_cols)

In [8]:
#Assessment: Evaluation of the model's performance via get_roc_auc_score, which yields a (train, test) pair of scores
# NOTE(review): the scores recorded with this cell are both below 0.5; if they
# are ROC AUC values that is worse than chance, so investigate the inputs and
# label/score orientation before relying on them.

train_auc, test_auc = get_roc_auc_score(results)
print(train_auc, test_auc)

0.11437840162536517 0.22477515628150835
