In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os


import optuna
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score

## Exploratory Data Analysis

Let's start by reading the data and performing some basic exploration using descriptive statistics.

In [None]:
# Read the Telco customer-churn CSV from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
train = pd.read_csv(csv_path)
train.head()

In [None]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = train.shape
print(f'Number of rows: {n_rows}, Number of columns = {n_cols}')

In [None]:
# Frequency of each MonthlyCharges value, keeping only the charges below 19.
charge_counts = train.MonthlyCharges.value_counts()
charge_counts[charge_counts.index < 19]

Let's explore the data types of the columns to make sure that every column has a reasonable, expected data type. For instance, we don't want datetime columns to be stored as strings, or a count to be stored as a string.

In [None]:
# Print each column's dtype and non-null count to spot columns stored with an unexpected type.
train.info()

In [None]:
# TotalCharges needs converting to float; the original cell special-cased ' '
# entries, so some rows evidently hold a blank instead of a number.
# pd.to_numeric parses the column without running eval() on file contents and
# turns anything unparseable (including the blanks) into NaN.
total_charges = pd.to_numeric(train.TotalCharges, errors='coerce')
n_unparsed = int(total_charges.isna().sum())
print(f'TotalCharges: {n_unparsed} blank/unparseable value(s) imputed with the median')

# Impute with the column median (a simple, neutral choice) so the z-score,
# ANOVA and tree-based cells further down receive a fully numeric column.
train.TotalCharges = total_charges.fillna(total_charges.median())

train.SeniorCitizen.value_counts()

In [None]:
# Frequency of each distinct tenure value.
train.tenure.value_counts()

In [None]:
# Number of missing values in every column.
train.isna().sum()

### Outliers Detection

There are many methods that can be used to detect outliers in a dataset. In this workshop we will discuss the following:
* Box Plot method
* Standardization (Z-score) method

##### Box Plot :: Consists of five main components:
* Q1, first quartile (median of the lower half of the data)
* Q2, median of the data
* Q3, third quartile (median of the upper half of the data)
* Max value
* Min value

##### Main equations in box plots:
$$ IQR = Q3 - Q1 $$
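$$ \text{Upper bound} = Q3 + 1.5 \times IQR $$
$$ \text{Lower bound} = Q1 - 1.5 \times IQR $$

A point is treated as an outlier when it lies above the upper bound or below the lower bound.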

##### Z-score method
The z-score is the number of standard deviations a data point lies away from the mean. Put more simply, it is the distance of a point from the mean, measured in standard deviations.
$$ z = \frac{x - \text{mean}}{\text{std}} $$

In [None]:
# Box plot of TotalCharges; points drawn beyond the whiskers are candidate outliers.
# seaborn (sns) and matplotlib.pyplot (plt) are already imported in the first cell.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=train['TotalCharges'], orient="h", palette="Set1", ax=ax)
ax.set_title('Box plot of TotalCharges');

In [None]:
# Side-by-side horizontal box plots of tenure and MonthlyCharges on one large axes.
fig, ax = plt.subplots(figsize=(20, 10))
box_columns = ['tenure', 'MonthlyCharges']
sns.boxplot(data=train[box_columns], orient="h", palette="Set1", ax=ax)

In [None]:
from scipy import stats

# A row is flagged as an outlier when ANY of the three numeric features lies
# more than Z_THRESHOLD standard deviations from its column mean.
# The absolute value makes the test two-sided: without it only unusually
# HIGH values are caught and unusually low ones are silently ignored.
Z_THRESHOLD = 2.5
numeric_features = train[['tenure', 'MonthlyCharges', 'TotalCharges']]
abs_z_scores = np.abs(stats.zscore(numeric_features.values))
rows = np.any(abs_z_scores > Z_THRESHOLD, axis=1)
outliers = train.loc[rows]
outliers.shape

In [None]:
# Grid of pairwise plots across the columns of train, to eyeball relationships between features.
sns.pairplot(train)

In [None]:
# Histogram of tenure with the default binning.
train.tenure.hist()

In [None]:
# Fine-grained (1000-bin) histogram of TotalCharges on a large 20x20 figure.
train.TotalCharges.hist(figsize = (20,20), bins = 1000)

### Categorical Variables
The `Churn` column consists of two categories only, Y and N. Let's explore whether those categories can be ordered or should simply be one-hot encoded.

#### Note:
One-hot encoding a feature adds one new feature for each unique category. So if you have only two categories, "Y" and "N", in Churn, you will get two new columns, Y and N: the Y column has 1s in the rows where Churn = "Y", and the N column has 1s in the rows where Churn = "N".

#### Example

One-hot encoding:

Churn &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Y | N <br>
Y &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;         1 | 0 <br>
N &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;         0 | 1 <br>
N &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;         0 | 1 <br>
Y &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;         1 | 0 <br>
Y &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;         1 | 0 <br>

Label encoding: if Y is ranked lower than N: <br>
Churn  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;    Churn_new <br>
Y  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;            1 <br>
N  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;            2 <br>
N  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;            2 <br>
Y  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;            1 <br>
Y  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;            1 <br>


But how can we really know whether the data is ranked or not? This comes from domain knowledge of the data: for example, most of the features are described on the main dataset page. In real-life problems you can also determine this from your own knowledge of the problem and of how the data was collected.
In this data it is clear that Y means the customer left and N means they are still a customer. Also note that Churn is the target variable, so I will label encode it to get a single output value for each row.

In [None]:
from sklearn import preprocessing

# Turn the Churn target into integer class codes; the fitted encoder keeps the
# original class names in label_enc.classes_ for later lookup.
label_enc = preprocessing.LabelEncoder()
label_enc.fit(train.Churn)
train.Churn = label_enc.transform(train.Churn)
labels = train.Churn

In [None]:
# Original class names seen by the encoder; position i is the class encoded as integer i.
label_enc.classes_

In [None]:
# One-hot encode gender; the indicator columns are attached under the names Female / Male.
gender_dummies = pd.get_dummies(train.gender)
train[['Female', 'Male']] = gender_dummies

In [None]:
# One-hot encode the remaining categorical columns.
# pd.get_dummies returns its indicator columns in sorted category order, and the
# names on the left-hand side are matched to those columns purely by position,
# so every name list below has to follow that sorted order.
train[['part_n', 'part_y']] = pd.get_dummies(train.Partner)
train[['dep_n', 'dep_y']] = pd.get_dummies(train.Dependents)
train[['phone_n', 'phone_y']] = pd.get_dummies(train.PhoneService)
# SeniorCitizen is assumed to be a 0/1 flag (TODO confirm against the data):
# the first indicator column is then the "0" (not senior) group and the second
# the "1" (senior) group, so the names must be listed in that order.
train[['not-senior', 'senior']] = pd.get_dummies(train.SeniorCitizen)
train[['one_line', 'no_line', 'multi-line']] = pd.get_dummies(train.MultipleLines)
train[['bt' ,'cc', 'ec', 'mc']] = pd.get_dummies(train.PaymentMethod)
train[['mm' ,'oy', 'ty']] = pd.get_dummies(train.Contract)

In [None]:
# Preview the frame with the newly added indicator columns (first rows only, not the whole table).
train.head()

In [None]:
# Names of every column whose dtype is not 'object', in their original order.
non_object_mask = (train.dtypes != 'object').to_numpy()
NUM_COLS = train.columns[non_object_mask].tolist()

In [None]:
# Display the list of non-object columns selected above.
NUM_COLS

In [None]:
# Feature table: the non-object columns with the Churn target removed.
train_new = train[NUM_COLS].drop(columns=['Churn'])

In [None]:
# Preview the feature table (first rows only, not the whole table).
train_new.head()

### Feature Selection

There are two main methods for feature selection:
* Statistical based Feature Selection
    * Correlation coefficients: correlation between the features and the target
    * Hypothesis testing, with the alternative hypothesis being that the feature is significant to the target variable
* Model Based Feature Selection

First we will split the data into training and testing sets, then check the correlations using a correlation matrix.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
feature_matrix = train_new.values
target_vector = labels.values
x_train, x_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.2,
    random_state=42,
)

## Correlation based feature selection

In [None]:
# Pairwise correlations between the non-object columns (Churn included, so the
# feature-vs-target correlations are visible too).
# Selecting NUM_COLS first keeps text columns out of .corr(), which raises on
# non-numeric data in pandas >= 2.0 instead of silently dropping them.
# seaborn (sns) and matplotlib.pyplot (plt) are already imported in the first cell.
corr = train[NUM_COLS].corr()
f, ax = plt.subplots(figsize=(25, 25))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=cmap, vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, ax=ax)
ax.set_title('Correlation matrix of the numeric and encoded columns');

## Model Based Selection

In [None]:
# Fit an Extra Trees model on the training split and rank the features by the
# importance the model assigns to them. ExtraTreesClassifier is imported in the
# first cell; a fixed random_state makes the ranking reproducible between runs.
clf = ExtraTreesClassifier(random_state=42)
clf.fit(x_train, y_train)

# Relative importance of each attribute, in the column order of train_new.
z = clf.feature_importances_
feature_names = list(train_new.columns.values)
assert len(z) == len(feature_names), "importances and column names are misaligned"

# Pair every importance with its column name and sort in descending order,
# so the weakest features end up at the bottom of the table.
df = pd.DataFrame({"values": z, "column": feature_names}).sort_values(
    by="values", ascending=False
)
df.head(100)

In [None]:
# Number of rows for each gender value.
train.gender.value_counts()

In [None]:
# Bar chart of the summed (label-encoded) Churn values for each gender.
churn_by_gender = train.groupby('gender')[['Churn']].sum()
churn_by_gender.plot.bar()

## Hypothesis testing based

In [None]:
# SelectKBest selects features according to the k highest scores of a given scoring function
from sklearn.feature_selection import SelectKBest
# f_classif is the scoring function: it models a statistical test known as ANOVA
from sklearn.feature_selection import f_classif

# Keep the 10 best-scoring features; fitting also populates k_best.pvalues_,
# which the next cell reads.
k_best = SelectKBest(f_classif, k = 10)
k_best.fit_transform( x_train, y_train)

In [None]:
# Pair each feature with the p-value of its ANOVA F-test and list the most
# significant features (smallest p-value) first.
# sort_values returns a new frame, so the result has to be assigned back for
# the displayed table to actually be sorted.
p_values = pd.DataFrame({'column': train_new.columns, 'p_value': k_best.pvalues_})
p_values = p_values.sort_values('p_value')
p_values

From the hypothesis testing, the correlations and the model-based feature selection, we can conclude that the gender feature doesn't have a significant effect on the Churn feature. It is also clear from the above analysis that tenure and MonthlyCharges are the most important features.

# Model Selection and Optimization using Optuna

In [None]:
from mlxtend.classifier import StackingCVClassifier

from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# ExtraTreesClassifier and XGBClassifier are imported in the first cell.
# Every stochastic estimator gets a fixed random_state (42, the same seed as the
# train/test split) so that re-running the notebook reproduces the same scores.
c1 = ExtraTreesClassifier(n_estimators=700, bootstrap=True, random_state=42)
meta2 = ExtraTreesClassifier(n_estimators=200, bootstrap=True, random_state=42)

c2 = RandomForestClassifier(n_estimators=500, bootstrap=True, random_state=42)
c3 = XGBClassifier(random_state=42)
# c4 is defined but not stacked: LinearSVC has no predict_proba, which the
# stack needs because it is built with use_probas=True.
c4 = svm.LinearSVC()
c5 = GradientBoostingClassifier(random_state=42)
# c6 is defined but not part of the stack below.
c6 = AdaBoostClassifier()
# Despite its name, `meta` is used as a first-level (base) classifier;
# the actual meta-classifier of the stack is `meta2`.
meta = LogisticRegression()

# First-level models feed their class probabilities to the meta-classifier.
etc = StackingCVClassifier(
    classifiers=[c1, c2, c3, meta, c5],
    use_probas=True,
    meta_classifier=meta2,
    random_state=42,
)

etc.fit(x_train, y_train)

In [None]:
# Accuracy, in percent, of the stacked ensemble on the training and held-out test splits.
train_accuracy = etc.score(x_train, y_train) * 100
test_accuracy = etc.score(x_test, y_test) * 100
print(f'Accuracy of classifier on training set: {train_accuracy:.2f}')
print(f'Accuracy of classifier on test set: {test_accuracy:.2f}')

In [None]:
from xgboost import XGBClassifier

def objective(trial,data=train_new.values,target=labels.values):
    """Optuna objective: validation accuracy of an XGBoost classifier for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    data : array-like
        Feature matrix. Defaults to every row of ``train_new``.
    target : array-like
        Class labels aligned with ``data``. Defaults to ``labels``.

    Returns
    -------
    float
        Accuracy on a fixed 15% validation split of ``data``.

    Notes
    -----
    The same validation split drives early stopping and is then scored, so the
    returned accuracy is an optimistic estimate.
    """
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15, random_state=42)

    param = {
        # CPU histogram algorithm: runs with or without a GPU. ('gpu_hist' is
        # deprecated in XGBoost 2.x; pass device='cuda' to train on a GPU.)
        'tree_method': 'hist',
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        # step must be passed by keyword in current Optuna releases.
        'n_estimators': trial.suggest_int('n_estimators', 100, 4000, step=100),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
    }

    # early_stopping_rounds is an estimator parameter (XGBoost >= 1.6); passing
    # it to fit() is no longer accepted by recent XGBoost releases.
    model = XGBClassifier(**param, early_stopping_rounds=100)
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=False)
    preds = model.predict(test_x)
    return accuracy_score(test_y, preds)


# Tune on the training split only: the objective's default data is the full
# table, which would let rows of x_test / y_test influence the chosen
# hyperparameters and make the final test accuracy optimistic.
# Seeding the (default) TPE sampler makes the search reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(lambda trial: objective(trial, data=x_train, target=y_train), n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Hyperparameters for the final XGBoost model, pinned as literals
# (presumably copied from an earlier Optuna run - TODO confirm).
param = {
    'lambda': 0.019097303955226335,
    'alpha': 6.255501364107075,
    'colsample_bytree': 0.5,
    'subsample': 0.7,
    'learning_rate': 0.018,
    'n_estimators': 3000,
    'max_depth': 5,
    'random_state': 24,
}

In [None]:
# Train an XGBoost model with the pinned hyperparameters, then report its
# accuracy (in percent) on the training split and on the held-out test split.
clf = XGBClassifier(**param)
clf.fit(x_train, y_train)

train_accuracy = clf.score(x_train, y_train) * 100
test_accuracy = clf.score(x_test, y_test) * 100
print(f'Accuracy of classifier on training set: {train_accuracy:.2f}')
print(f'Accuracy of classifier on test set: {test_accuracy:.2f}')