In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import xgboost
import graphviz

# Seed passed as random_state to train_test_split and XGBClassifier for reproducible runs.
RANDOM_SEED = 1212

## Data Loading

In [None]:
# Load the tab-separated training features and preview the first rows.
train = pd.read_csv(
    "../data/small/train/orange_small_train.data",
    sep="\t",
)
train.head()

In [None]:
# Read the single-column label file and turn it into a categorical whose two
# categories are renamed, in their existing order, to False / True.
# The rename is chained (not `inplace=True`) because the `inplace` argument of
# `rename_categories` was removed in pandas 2.0.
churn_label = (
    pd.read_csv(
        "../data/small/labels/orange_small_train_churn.labels", sep="\t", header=None
    )
    .iloc[:, 0]
    .astype("category")
    .cat.rename_categories([False, True])
)

churn_label.head()

## Dataframe Conversion

In [None]:
def convert_dataframe(DF):
    """Coerce every column of ``DF`` to either float64 or ``category``.

    Integer columns, and floating-point columns that are not already float64,
    are cast to ``float``; float64 columns are left as they are; every other
    column is cast to ``category``.

    ``DF`` itself is modified (its columns are replaced) and the same object
    is returned.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose columns should be converted.

    Returns
    -------
    tuple
        ``(DF, df_types)``, where ``df_types`` maps each column name to the
        dtype that column has after conversion.
    """
    is_integer = pd.api.types.is_integer_dtype
    is_floating = pd.api.types.is_float_dtype

    df_types = {}

    for df_var_name in DF.columns:
        column = DF[df_var_name]

        if column.dtype != float:
            # Use dtype predicates rather than `== int` so that every integer
            # / float width is treated as numeric, not only the default one.
            if is_integer(column.dtype) or is_floating(column.dtype):
                column = column.astype(float)
            else:
                column = column.astype("category")

            # Plain column assignment replaces the column, so the new dtype
            # sticks. `DF.loc[:, name] = column` writes into the existing
            # column on pandas >= 2.0 and would keep the old dtype.
            DF[df_var_name] = column

        df_types[df_var_name] = column.dtype

    return DF, df_types

In [None]:
# Cast integer columns to float and non-float columns to category; keep the per-column dtype map.
train, train_types = convert_dataframe(train)

## Train and Test split

In [None]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    churn_label,
    random_state=RANDOM_SEED,
    test_size=0.2,
)

In [None]:
# Show (rows, columns) of the train split, then of the test split.
print(x_train.shape, x_test.shape, sep="\n")

## Data Cleaning and Filling

In [None]:
def plot_missing_data(x_train, x_test):
    """Draw side-by-side histograms of the per-column missing-value fraction.

    The left panel is computed from ``x_train`` and the right panel from
    ``x_test``; both panels share their x and y axes. The figure is shown
    immediately and nothing is returned.
    """
    panels = (
        ("Train data", x_train.isnull().sum() / x_train.shape[0]),
        ("Test data", x_test.isnull().sum() / x_test.shape[0]),
    )

    fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharex="all", sharey="all")
    for axis, (panel_title, missing_fraction) in zip(axes, panels):
        axis.hist(missing_fraction)
        axis.set_title(panel_title)

    fig.suptitle("Missing data proportions")
    plt.show()

In [None]:
# Histogram the per-column fraction of missing values for both splits, before any cleaning.
plot_missing_data(x_train, x_test)

I'll keep as model features only the columns in which at most 15% of the values are missing.

In [None]:
# Per-column share of missing values in the training split.
x_train_missing = x_train.isnull().sum() / x_train.shape[0]
# Keep the columns whose missing share does not exceed 15%.
x_features = x_train_missing.loc[x_train_missing <= 0.15].index

print(x_features, len(x_features), sep="\n")

Fill the missing numeric values with the mean of their respective columns, computed on the training split.

In [None]:
# Selected features whose recorded dtype is float, i.e. the numeric ones.
numeric_x_features = [name for name in x_features if train_types[name] == float]

print(numeric_x_features, len(numeric_x_features), sep="\n")

In [None]:
# Replace the gaps in each numeric column with that column's mean over x_train.
x_train[numeric_x_features] = x_train[numeric_x_features].fillna(
    value=x_train[numeric_x_features].mean()
)

In [None]:
# Re-plot after the mean imputation above; only x_train was filled, so x_test is unchanged.
plot_missing_data(x_train, x_test)

I'll now remove the categorical features that have more than 400 categories.

In [None]:
# Selected features whose recorded dtype is not float, i.e. the categorical ones.
categorical_x_features = [name for name in x_features if train_types[name] != float]

print(categorical_x_features, len(categorical_x_features), sep="\n")

In [None]:
# Number of category levels declared on each categorical feature.
categorical_levels = x_train[categorical_x_features].apply(
    lambda column: len(column.cat.categories)
)

# Keep only the features with at most 400 levels.
categorical_x_features_filtered = categorical_levels.loc[
    categorical_levels <= 400
].index.tolist()
print(categorical_x_features_filtered, len(categorical_x_features_filtered), sep="\n")

In [None]:
# Final feature list: numeric features first, then the retained categorical ones.
x_features_filtered = [*numeric_x_features, *categorical_x_features_filtered]
print(x_features_filtered, len(x_features_filtered), sep="\n")

In [None]:
# Restrict to the selected features and one-hot encode them into boolean indicator columns.
x_train = pd.get_dummies(x_train[x_features_filtered], dtype=bool)
x_train.head()

## Model Training

In [None]:
# train_data = xgboost.DMatrix(data=pd.get_dummies(x_train, dtype=bool), label=y_train)

In [None]:
# Gradient-boosted classifier: 3000 trees of depth 5 at learning rate 0.1,
# trained on 8 threads and seeded for reproducibility.
xgb = xgboost.XGBClassifier(
    n_estimators=3000,
    max_depth=5,
    learning_rate=0.1,
    random_state=RANDOM_SEED,
    n_jobs=8,
)

In [None]:
# Fit the classifier on the one-hot-encoded training features and the churn labels.
xgb.fit(x_train, y_train)