# Setup

In [None]:
# Potential additions
# Column combinations / modifications
# Grid search

In [None]:
import os
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import issparse
from sklearn.metrics import silhouette_score

In [None]:
# Seed NumPy's global random generator so reruns start from the same state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
# Ask scikit-learn to render estimators with its 'diagram' display mode.
sk.set_config(display="diagram")

In [None]:
# Kaggle dirs: /kaggle/input/, /kaggle/working/ and /kaggle/temp/
# Print the full path of every file found under the input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Load the training set and discard the CustomerID column.
train = pd.read_csv(
    '/kaggle/input/datasets-for-churn-telecom/cell2celltrain.csv'
).drop(columns="CustomerID")

# General Outlook
(Optional)

In [None]:
# Display the loaded training frame (CustomerID has already been dropped).
train

In [None]:
# Display the summary statistics produced by DataFrame.describe().
train.describe()

In [None]:
# Display the column labels of the training frame.
train.columns

In [None]:
# Correlation of every numeric column with MonthlyRevenue, strongest first.
# The frame still contains object (string) columns, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 instead of silently dropping it,
# so the numeric columns are selected explicitly before correlating.
train.select_dtypes(np.number).corr()["MonthlyRevenue"].sort_values(ascending=False)

# Data Division

In [None]:
# Split the columns by dtype: numeric features vs. object (string) features.
train_num = train.select_dtypes(np.number)
# The builtin `object` is used here because the `np.object` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24 (AttributeError).
train_cat = train.select_dtypes(object)
num_attributes = train_num.columns
cat_attributes = train_cat.columns

# Boolean Series (indexed by column name): True where a column has any NaN.
with_missing = train.isnull().sum() > 0
# Sub-frame holding only the columns that contain missing values ...
missing_data = train[with_missing[with_missing].index]
# ... split again into its numeric and non-numeric parts.
missing_num = missing_data.select_dtypes(np.number)
missing_cat = missing_data.select_dtypes(exclude=np.number)

# Categorical variable analysis
(Optional)

In [None]:
# Display the categorical columns from position 8 onwards.
train_cat.iloc[:,8:]
# Encoding plan:
# ordinal - CreditRating
# onehot - Occupation, PrizmCode, ServiceArea, MaritalStatus
# the rest is boolean, using Ordinal (with a caveat described in the cell below)

In [None]:
# Handling "Unknown" (aka extra NA): count the distinct values of every
# categorical column that contains "Unknown" at least once.
train_cat.loc[:, (train_cat == "Unknown").any()].nunique()
# Findings:
# - Homeownership: "Unknown" is one side of a boolean (Known/Unknown - 1/0)
# - MaritalStatus: "Unknown" is the N/A of a boolean (other values being Yes/No)
# - HandsetPrice: "Unknown" is an actual N/A
# => a custom transformer is needed
# Boolean columns go through the ordinal encoder.
# TODO: check if order matters

In [None]:
# Number of distinct values per categorical column.
n_unique_cat = train_cat.nunique()
# True for columns holding more than two distinct values (not boolean-like).
non_boolean_cat = n_unique_cat.gt(2)
n_unique_cat

In [None]:
# Display only the categorical columns with more than two distinct values,
# selected through the boolean mask computed from the per-column counts.
train_cat.loc[:, non_boolean_cat]

# Prep-Pipelining

In [None]:
class NAValueFiller(BaseEstimator, TransformerMixin):
    """Stateless transformer that substitutes a constant for NaN entries.

    Parameters
    ----------
    val_to_fill : default "Unknown"
        Replacement written wherever the input holds ``np.nan``.
    """

    def __init__(self, val_to_fill="Unknown"):
        self.val_to_fill = val_to_fill

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator API."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with every NaN replaced by ``val_to_fill``.

        ``X`` must provide a pandas-style ``replace`` method.
        """
        replacement = self.val_to_fill
        return X.replace(to_replace=np.nan, value=replacement)

class ToNumeric(BaseEstimator, TransformerMixin):
    """Stateless transformer converting each column to a numeric dtype."""

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator API."""
        return self

    def transform(self, X, y=None):
        """Apply ``pd.to_numeric`` column by column.

        ``errors='coerce'`` turns values that cannot be parsed into NaN
        rather than raising. ``X`` must provide a pandas-style ``apply``.
        """
        return X.apply(pd.to_numeric, errors='coerce')

class Debug(BaseEstimator, TransformerMixin):
    """Pass-through step that prints a preview of the data it receives."""

    def fit(self, X, y=None, **params):
        """Learn nothing; extra fit parameters are accepted and ignored."""
        return self

    def transform(self, X):
        """Print the first rows and the shape of ``X``, then return it as is."""
        preview = pd.DataFrame(X).head()
        print(preview)
        print(X.shape)
        return X

class Densifier(BaseEstimator, TransformerMixin):
    """Stateless transformer that converts sparse input to a dense array.

    Dense input is returned unchanged, so the step is safe to keep in a
    pipeline whether or not the preceding step emits a sparse matrix.
    """

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return ``X`` as a dense ``numpy.ndarray`` when it is sparse.

        ``toarray()`` is used instead of ``todense()``: the latter returns a
        ``numpy.matrix``, which recent scikit-learn estimators reject with a
        ``TypeError``.
        """
        if issparse(X):
            return X.toarray()
        return X

In [None]:
# Fill the categorical columns that have missing values (missing_cat, noted
# as ServiceArea) with the NAValueFiller default; other columns pass through.
cat_filling_pipeline = ColumnTransformer(
    transformers=[("navf", NAValueFiller(), missing_cat.columns)],
    remainder='passthrough',
)

# Columns that get one-hot encoded; the remaining categoricals, except
# HandsetPrice, are ordinal encoded.
to_one_hot_attribs = ["Occupation", "PrizmCode", "ServiceArea", "MaritalStatus"]
to_one_hot = train_cat[to_one_hot_attribs]
to_ordinal = train_cat.drop(columns=to_one_hot_attribs + ['HandsetPrice'])

# A ("dbg", Debug()) step can be appended to either pipeline to inspect the
# intermediate output.
one_hot_pipeline = Pipeline(steps=[
    ("filling", cat_filling_pipeline),
    ('encoder', OneHotEncoder()),
])
ordinal_pipeline = Pipeline(steps=[
    ('encoder', OrdinalEncoder()),
])

In [None]:
# Note:
# ColumnTransformer will split a given step into *n* funnels, whereas Pipeline just applies steps sequentially

In [None]:
# HandsetPrice lives among the object columns but is processed as a numeric
# feature: it is converted with ToNumeric and then imputed together with the
# numeric columns that contain missing values.
handset_col = train.HandsetPrice.name
to_nan_impute = [handset_col] + missing_num.columns.tolist()
num_attribtues = train_num.columns.tolist() + [handset_col]

# Convert HandsetPrice to numbers; the other columns pass through.
handset_pipeline = ColumnTransformer(
    transformers=[('to_numeric', ToNumeric(), [handset_col])],
    remainder='passthrough',
)
# Then replace every NaN with the column median.
nan_impute_pipeline = Pipeline(steps=[
    ("handset", handset_pipeline),
    ('imputer', SimpleImputer(strategy="median")),
])

# Impute only the columns that need it; the remaining numeric columns pass
# through and are appended after the imputed ones.
num_prep_pipeline = ColumnTransformer(
    transformers=[("inpute_pipeline", nan_impute_pipeline, to_nan_impute)],
    remainder='passthrough',
)
# A ("dbg", Debug()) step can be inserted before the scaler to inspect the data.
num_pipeline = Pipeline(steps=[
    ("num_prep", num_prep_pipeline),
    ('scaler', StandardScaler()),
])

In [None]:
# Full preprocessing: each column group is routed to its own sub-pipeline.
# Output column order follows the order of the entries below.
prep_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribtues),
        ("one_hot_cat", one_hot_pipeline, to_one_hot_attribs),
        ("ordinal_cat", ordinal_pipeline, to_ordinal.columns),
    ],
)

In [None]:
# Display the assembled preprocessing transformer.
prep_pipeline

In [None]:
# Fit the preprocessing on the training frame and keep the transformed data;
# it is reused below for silhouette scoring and for cluster profiling.
prepared = prep_pipeline.fit_transform(train)

# PCA + Model (KMeans) - Full Pipeline

In [None]:
# End-to-end pipeline: preprocessing -> dense conversion -> PCA -> KMeans.
# PCA is configured with n_components=0.9; the number of clusters is set
# later through set_params.
full_pipeline = Pipeline(steps=[
    ("preparation", prep_pipeline),
    ("densify_for_pca", Densifier()),
    ("pca", PCA(n_components=0.9)),
    ("model", KMeans()),
])

In [None]:
# Silhouette score for each candidate cluster count k = 2..9.
# The score is computed on the preprocessed (pre-PCA) data `prepared`,
# using a random sample of 2000 rows per evaluation.
silhouette_scores = {}
for k in range(2, 10):
    full_pipeline.set_params(model__n_clusters=k)
    # fit_predict returns the labels_ of the final KMeans step.
    labels = full_pipeline.fit_predict(train)
    silhouette_scores[k] = silhouette_score(prepared, labels, sample_size=2000)

In [None]:
# Configure the model with the cluster count that scored highest.
best_k = max(silhouette_scores, key=silhouette_scores.get)
full_pipeline.set_params(model__n_clusters=best_k)

In [None]:
# Refit the whole pipeline with the selected cluster count and keep the
# cluster label assigned to each row.
result = full_pipeline.fit_predict(train)

In [None]:
# Total fraction of variance retained by the fitted PCA step; expected to be
# about 0.9 because the step was created with n_components=0.9.
full_pipeline.named_steps["pca"].explained_variance_ratio_.sum()

In [None]:
# Share of rows assigned to each cluster (count divided by total rows).
print("% of customers per cluster")
pd.Series(result).value_counts().div(len(result))

# Profiling

In [None]:
# Rebuild the column names of the preprocessed matrix.
# Numeric names follow the order produced by the numeric sub-pipeline:
# 1. HandsetPrice, 2. the other imputed columns, 3. the remaining numerics.
removables = to_nan_impute.copy()
removables.remove('HandsetPrice')
num_names = np.concatenate([to_nan_impute, train_num.columns.drop(removables)])

preparation = full_pipeline.named_steps['preparation']
# transformers_ entries are (name, fitted_transformer, columns) tuples.
encoder = preparation.transformers_[1][1]['encoder']
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on versions that predate it.
if hasattr(encoder, "get_feature_names_out"):
    one_hot_names = encoder.get_feature_names_out()
else:
    one_hot_names = encoder.get_feature_names()
# NOTE(review): the generated names presumably carry positional prefixes
# (x0_, x1_, ...) that follow the column order emitted by the filling step,
# not the order of to_one_hot_attribs - confirm before relabelling.
ordinal_cat_names = preparation.transformers_[2][2]
ordered_names = np.concatenate([num_names, one_hot_names, ordinal_cat_names])

In [None]:
# Build a labelled frame from the preprocessed data.
# `prepared` is converted only when it is actually sparse (a dense ndarray
# has no todense()), and toarray() is used so the frame is built from a
# plain ndarray rather than a numpy.matrix.
dense_prepared = prepared.toarray() if issparse(prepared) else prepared
prepared_frame = pd.DataFrame(dense_prepared, columns=ordered_names)
# Attach the cluster label of each row.
prepared_frame["Segment"] = result

In [None]:
# Undo the standard scaling on the numeric columns so that segment profiles
# read in original units. The numeric columns are the first len(num_names)
# columns of the frame.
scaler = full_pipeline.named_steps['preparation'].transformers_[0][1]['scaler']
unscaled = scaler.inverse_transform(prepared_frame[num_names])
prepared_frame.iloc[:, :len(num_names)] = unscaled

In [None]:
# Profile the segments: mean of every column within each cluster.
prepared_frame.groupby("Segment").mean()

---