In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff
from scripts.automl import automl
from sklearn.model_selection import StratifiedKFold

# Read Data

In [2]:
# Parse the ARFF file; only element 0 of the returned value (the records) is
# used below to build the working DataFrame.
churn_data = arff.loadarff("data/chrun.arff")
churn_records = churn_data[0]
churn_df = pd.DataFrame(churn_records)

Light pre-processing: the columns **class** and **number_customer_service_calls** can be cast to integers.

In [3]:
# Cast the target label and the service-call count to plain Python-int dtype.
for int_column in ("class", "number_customer_service_calls"):
    churn_df[int_column] = churn_df[int_column].astype(int)

# Prepare for AutoML

In [4]:
# Features: every column except the target label.
X_complete = churn_df.drop(columns=["class"])
# Target: the label column as a flat 1-D array with one entry per row of X_complete.
y_complete = churn_df["class"].to_numpy()

# Apply AutoML

In [5]:
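# Disabled: one-off AutoML run on the complete dataset, kept for reference.
# The K-fold experiments below call automl() once per fold instead.
#complete_archive, best_pipeline, final_metrics = automl(X_complete, y_complete, 
#                                                        classification_algorithms=["EL", "RF", "GB"],
#                                                        numerical_strategies=["SSE", "MMS"],
#                                                        test_fraction=0.3, cv_folds=5, n_jobs=50)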

# K-fold experiments

In [6]:
import tqdm
from sklearn.metrics import SCORERS
from sklearn.linear_model import LogisticRegression
from scripts.preprocessing_functions import basic_processing
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

In [7]:
# 10 stratified folds. Shuffling stays at its default (off), so the same
# train/test indices are produced on every run.
skf = StratifiedKFold(n_splits=10, shuffle=False)

In [8]:
# K-fold comparison of three models on identical splits:
#   * AutoML - the best pipeline returned by automl() for the fold's training data
#   * RF     - a RandomForestClassifier with default hyper-parameters
#   * MV     - a soft-voting ensemble of random forest, logistic regression and
#              gradient boosting
# Results:
#   kfold_archive  - one row per (model, fold) with the held-out score
#   automl_archive - the automl() search archive of every fold, tagged with "kfold"

# Experiment configuration.
scorer = "neg_log_loss"  # key into sklearn's SCORERS registry
ncores = 50              # worker count passed as n_jobs to every parallel step
rand_state = 42          # shared seed for all stochastic components

# Look the scorer up once; it is called as score_fn(estimator, X, y).
# NOTE(review): SCORERS is deprecated in recent scikit-learn releases in favour of
# sklearn.metrics.get_scorer - switch when upgrading.
score_fn = SCORERS[scorer]

# Collect per-fold frames and concatenate once after the loop. Growing a
# DataFrame with pd.concat on every iteration re-copies all earlier rows.
kfold_frames = []
automl_frames = []

for fold, (train_index, test_index) in enumerate(skf.split(X_complete, y_complete)):
    print(f"FOLD {fold}")

    # Splitting the data
    # NOTE(review): basic_processing is applied to the train and test partitions
    # independently. If it derives anything from the data it receives, the two
    # partitions may be transformed inconsistently - confirm.
    X_train = basic_processing(X_complete.iloc[train_index, :])
    y_train = y_complete[train_index]

    X_test = basic_processing(X_complete.iloc[test_index, :])
    y_test = y_complete[test_index]

    # Training
    ## AutoML
    complete_archive, best_pipeline, final_metrics = automl(
        X_train, y_train,
        classification_algorithms=["EL", "RF", "GB"],
        numerical_strategies=["SSE", "MMS"],
        test_fraction=0.1, cv_folds=4,
        random_state=rand_state,
        num_iterations=4,
        n_jobs=ncores,
    )

    ## Random Forest
    rf = RandomForestClassifier(random_state=rand_state, n_jobs=ncores)
    rf.fit(X_train, y_train)

    ## Majority voting
    mv_rf = RandomForestClassifier(random_state=rand_state, n_estimators=50)
    # The saga solver shuffles the data, so it is seeded like the other
    # estimators to keep the fold results reproducible.
    mv_lr = LogisticRegression(solver="saga", penalty="elasticnet", l1_ratio=0.5, C=0.5,
                               random_state=rand_state)
    mv_gb = GradientBoostingClassifier(n_estimators=50, random_state=rand_state)

    mv = VotingClassifier(estimators=[("rf", mv_rf), ("lr", mv_lr), ("gb", mv_gb)],
                          voting='soft', n_jobs=ncores)
    mv.fit(X_train, y_train)

    # Testing: score every model on the held-out fold.
    mv_score = score_fn(mv, X_test, y_test)
    rf_score = score_fn(rf, X_test, y_test)
    am_score = score_fn(best_pipeline, X_test, y_test)

    # The score list must follow the same order as the model labels.
    # (Previously the AutoML and MV scores were stored under each other's name.)
    k_performance = pd.DataFrame({"model": ["AutoML", "RF", "MV"],
                                  "validation": [am_score, rf_score, mv_score],
                                  "kfold": [fold, fold, fold]})
    kfold_frames.append(k_performance)

    # Keep info about kfold for automl
    complete_archive["kfold"] = fold
    automl_frames.append(complete_archive)

# Assemble the final archives in a single concatenation each.
kfold_archive = pd.concat(kfold_frames)
automl_archive = pd.concat(automl_frames)

FOLD 0


18it [01:26,  4.82s/it]


FOLD 1


18it [01:27,  4.84s/it]


FOLD 2


18it [01:25,  4.74s/it]


FOLD 3


18it [01:30,  5.05s/it]


FOLD 4


18it [01:25,  4.77s/it]


FOLD 5


18it [01:25,  4.76s/it]


FOLD 6


18it [01:28,  4.90s/it]


FOLD 7


18it [01:27,  4.85s/it]


FOLD 8


18it [01:29,  5.00s/it]


FOLD 9


18it [01:26,  4.79s/it]
