# Functions

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from random import sample
from sklearn.utils import resample
from imblearn import under_sampling,over_sampling
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,learning_curve,StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.dummy import DummyClassifier
from statsmodels.stats import stattools
import statsmodels.graphics.tsaplots as smgt
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix, classification_report, precision_recall_curve, average_precision_score, roc_auc_score
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
def plot_categoric_feature(feature_name):
    """Plot churn statistics for one categorical column of the global ``dfdata``.

    Three horizontal bar charts are drawn side by side:
      1. the share of churned customers ('churn' == 'Yes') inside each level
         of the feature,
      2. the share of every (level, churn) pair in the whole dataset,
      3. the mean monthly charges of every (level, churn) pair.

    Parameters
    ----------
    feature_name : str
        Name of a column of ``dfdata``. ``dfdata`` must also contain the
        'customerid', 'churn' and 'monthlycharges' columns.
    """
    dftemp = dfdata.groupby([feature_name,'churn'],as_index = False).agg({'customerid':'count','monthlycharges':'mean'})
    # Customers per feature level, broadcast back onto every (level, churn) row.
    # A grouped transform replaces the former row-wise apply, which re-filtered
    # the whole frame once per row.
    level_totals = dftemp.groupby(feature_name)['customerid'].transform('sum')
    dftemp['feature_rate'] = dftemp['customerid'] / level_totals
    dftemp['rate'] = dftemp['customerid'] / dftemp['customerid'].sum()
    fig,ax = plt.subplots(1,3,figsize=(20,3))
    sns.barplot(y=feature_name, x = 'feature_rate', color = 'darkorange',data = dftemp[dftemp['churn'] == 'Yes'],ax = ax[0])
    ax[0].set(title = 'churn rate in feature ' + feature_name, xlabel = '')
    sns.barplot(y=feature_name, x = 'rate', hue='churn',data = dftemp,ax = ax[1])
    ax[1].set(title = feature_name + ' rate in whole dataset',xlabel = '')
    sns.barplot(y=feature_name, x = 'monthlycharges', hue='churn',data = dftemp,ax = ax[2])
    ax[2].set(title = feature_name + ' monthlycharges',xlabel = '')
    plt.show()

In [None]:
def plot_learning_curve(estimator, X_train, y_train):
    """Draw train and cross-validation score curves on the current axes.

    ``learning_curve`` is evaluated with a 5-fold ``StratifiedKFold`` on 20
    training-set fractions spaced evenly between 5% and 100%. For each curve
    the mean score is drawn as a line and one standard deviation around it
    as a shaded band (red for train, blue for test).

    Parameters
    ----------
    estimator : estimator object passed on to ``learning_curve``.
    X_train, y_train : features and labels used for the curve.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator,
        X_train,
        y_train,
        train_sizes = np.linspace(0.05,1,20),
        cv = StratifiedKFold(n_splits = 5),
    )

    # (scores per fold, colour, legend label) for each curve, drawn in this order.
    curves = ((fit_scores, 'r', 'train'), (cv_scores, 'b', 'test'))
    for fold_scores, colour, name in curves:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        sns.lineplot(x=sizes, y=centre, c=colour, label=name)
        plt.fill_between(x=sizes, y1=centre+spread, y2=centre-spread, alpha=0.1, color=colour)

    plt.legend(loc='best')
    plt.title("Learning Curve")

In [None]:
def plot_confusion_matrix(estimator,y,y_pred):
    """Draw the 2x2 confusion matrix of ``y`` versus ``y_pred`` as a heatmap.

    Rows are the actual classes and columns the predicted ones; labels 0 and 1
    are displayed as "No" and "Yes". The plot goes to the current axes.

    Parameters
    ----------
    estimator : not used by this function; kept so existing calls keep working.
    y : true labels.
    y_pred : predicted labels.
    """
    class_names = ["No", "Yes"]
    counts = confusion_matrix(y, y_pred, labels = [0,1])
    sns.heatmap(
        counts,
        annot=True,
        fmt='.0f',
        xticklabels = class_names,
        yticklabels = class_names,
        cbar = False,
    )
    plt.xlabel('Prediction')
    plt.ylabel('Actual')
    plt.title("Confusion Matrix")

In [None]:
def plot_empty_confusion_matrix():
    """Draw a schematic 2x2 confusion matrix (TN / FP / FN / TP) on the current axes.

    The top row is the negative class and the bottom row the positive class;
    the left column is "predicted negative" and the right "predicted positive".
    """
    # (x, y, text) of the right-aligned label placed in each quadrant.
    quadrant_labels = (
        (0.45, .6, "TN"),
        (0.45, .1, "FN"),
        (.95, .6, "FP"),
        (.95, 0.1, "TP"),
    )
    for x_pos, y_pos, text in quadrant_labels:
        plt.text(x_pos, y_pos, text, size=100, horizontalalignment='right')

    tick_positions = [.25, .75]
    plt.xticks(tick_positions, ["predicted negative", "predicted positive"], size=15)
    plt.yticks(tick_positions, ["positive class", "negative class"], size=15)

    # Dashed cross that splits the unit square into the four quadrants.
    plt.plot([.5, .5], [0, 1], '--', c='k')
    plt.plot([0, 1], [.5, .5], '--', c='k')
    plt.xlim(0, 1)
    plt.ylim(0, 1)

In [None]:
def print_clf_result(estimator,X_train,y_train,X_test = None, y_test = None):
    """Print accuracy and plot a learning curve plus a confusion matrix.

    The train accuracy is always printed. When a test set is supplied, its
    accuracy is printed as well and the confusion matrix is built from the
    test predictions; otherwise it is built from the train predictions.
    The learning curve is always computed on the training data.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : optional held-out features and labels.
    """
    y_pred = estimator.predict(X_train)
    acc = round(accuracy_score(y_train, y_pred),2)
    print("Train Accuracy : ", acc)

    if X_test is None:
        # No hold-out set: reuse the train predictions instead of predicting again.
        cm_true, cm_pred = y_train, y_pred
    else:
        # Predict once and derive both the accuracy and the confusion matrix from it.
        y_pred_test = estimator.predict(X_test)
        test_acc = round(accuracy_score(y_test, y_pred_test),2)
        print("Classification Accuracy :", test_acc)
        cm_true, cm_pred = y_test, y_pred_test

    plt.figure(figsize=(18,5))
    plt.subplot(121)
    plot_learning_curve(estimator,X_train,y_train)
    plt.subplot(122)
    plot_confusion_matrix(estimator,cm_true,cm_pred)
    plt.show()

In [None]:
def make_clf_data(n_points, n_centers = 2, random_state = 42):
    """Generate a noisy two-feature toy dataset for binary classification.

    ``n_points`` rows are drawn around each of ``n_centers`` Gaussian centres
    located at (0, 0), (5, 5), (10, 10), ... with unit variance. The target is
    ``feature1 > 2``; label noise is then added by flipping up to 20% of
    ``n_points`` True labels to False and afterwards up to 10% of ``n_points``
    False labels to True. Rows are shuffled before being returned.

    All randomness (points, label flips, shuffle) comes from one generator
    seeded with ``random_state``, so equal arguments give equal frames.

    Parameters
    ----------
    n_points : int
        Number of rows generated per centre.
    n_centers : int, default 2
        Number of Gaussian centres.
    random_state : int, default 42
        Seed for ``np.random.RandomState``.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature1', 'feature2' and the boolean 'target'.
    """
    n_features = 2
    rnd_gen = np.random.RandomState(random_state)
    feature_names = ['feature' + str(x+1) for x in range(n_features)]

    # Build every cluster first and assemble the frame once; DataFrame.append
    # no longer exists in pandas 2.x and grew the frame quadratically anyway.
    clusters = [rnd_gen.normal(loc=5*center, size=(n_points, n_features)) for center in range(n_centers)]
    values = np.vstack(clusters) if clusters else np.empty((0, n_features))
    X = pd.DataFrame(values, columns=feature_names)
    X['target'] = X['feature1'] > 2

    # Label noise: flip some True rows, then some False rows (the second pool
    # includes the rows that were just flipped). The number of flips is capped
    # at the rows available so small inputs do not raise.
    for current_label, fraction in ((True, 0.2), (False, 0.1)):
        candidates = X.index[X['target'] == current_label].to_numpy()
        n_changes = min(int(n_points * fraction), len(candidates))
        flipped = rnd_gen.permutation(candidates)[:n_changes]
        if len(flipped):
            X.loc[flipped,'target'] = not current_label

    X = X.sample(frac=1, random_state=rnd_gen).reset_index(drop=True)
    return X

In [None]:
def plot_decision_boundary(estimator,X,title = None):
    """Scatter the toy data and overlay the estimator's decision boundary.

    The boundary is the zero contour of ``estimator.decision_function``
    evaluated on a fixed 100 x 100 grid covering [-3, 9] on both axes.

    Parameters
    ----------
    estimator : fitted model exposing ``decision_function`` for two features.
    X : pandas.DataFrame with 'feature1', 'feature2' and 'target' columns.
    title : str, optional
        Title of the plot; no title is set when None.
    """
    xx = np.linspace(-3, 9, 100)
    yy = np.linspace(-3, 9, 100)
    X1, X2 = np.meshgrid(xx, yy)
    X_grid = np.c_[X1.ravel(), X2.ravel()]
    decision_values = estimator.decision_function(X_grid)
    sns.scatterplot(x='feature1', y='feature2', hue='target', data = X)
    # levels must be a list to request the contour at value 0; an integer is
    # interpreted by matplotlib as a number of automatically chosen levels.
    plt.contour(X1, X2, decision_values.reshape(X1.shape), colors="black",levels = [0])
    if title is not None:
        plt.title(title)


# Get Data

In [None]:
dfdata = pd.read_csv("../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
dfdata.columns = dfdata.columns.str.lower()
dfdata.shape

In [None]:
dfdata.describe()

# Missing Values

In [None]:
dfdata.duplicated().sum()

In [None]:
dfdata.isnull().sum()

In [None]:
dfdata.info()

In [None]:
dfdata.nunique()

In [None]:
dfdata['seniorcitizen'] = dfdata['seniorcitizen'].map({0:'No',1:'Yes'})

In [None]:
dfdata['totalcharges_new'] = pd.to_numeric(dfdata['totalcharges'],errors = 'coerce')
dfdata[dfdata['totalcharges_new'].isnull() == True][['totalcharges','totalcharges_new']]

In [None]:
# Values that could not be parsed as numbers became NaN in 'totalcharges_new';
# treat them as 0. The filled Series is assigned back explicitly because
# fillna(inplace=True) on a column selection may act on a temporary copy and
# leave dfdata unchanged.
dfdata['totalcharges'] = dfdata['totalcharges_new'].fillna(0)
dfdata.drop('totalcharges_new',axis=1,inplace=True)

# EDA

In [None]:
dfdata.head()

## Target Variable

In [None]:
print(round(100 * dfdata['churn'].value_counts() / dfdata.shape[0],0))
plt.figure(figsize=(10,2))
sns.countplot(y='churn',data = dfdata)
plt.show()

## Numerical Features

In [None]:
numeric_features = dfdata.columns[dfdata.dtypes != 'object'].values.tolist()
categoric_features = dfdata.columns[dfdata.dtypes == 'object'].values.tolist()
categoric_features.remove('customerid')
categoric_features.remove('churn')
print("Categoric features : ",categoric_features)
print("Numeric features : ",numeric_features)

In [None]:
# One panel per numeric column, each overlaying the distribution of the
# customers who stayed ('No') and of those who churned ('Yes').
panels = [('tenure','Tenure'),('monthlycharges','Monthly Charges'),('totalcharges','Total Charges')]
fig,ax = plt.subplots(1,len(panels),figsize=(18,5))

for axis,(column,heading) in zip(ax,panels):
    for churn_value in ('No','Yes'):
        sns.distplot(dfdata.loc[dfdata['churn'] == churn_value, column],label = churn_value,ax=axis)
    axis.set_title(heading)
    axis.legend()

plt.show()

- New customers until 20 months tend to churn more
- Customers monthly charges higher than 70 tend to churn more
- Total charges dont depend on churn. Seems to be non important

In [None]:
sns.pairplot(dfdata[numeric_features + ['churn']],hue = 'churn', diag_kind = 'kde')
plt.show()

In [None]:
bins = range(12,75,12)
dfdata['tenure_bin'] = np.digitize(dfdata['tenure'],bins,right = True)
dfdata['tenure_bin'] = dfdata['tenure_bin'].astype('category')
categoric_features.append('tenure_bin')
plot_categoric_feature('tenure_bin')

In [None]:
# Average charge per month of tenure. A tenure of 0 would make the division
# yield NaN or inf, so those rows fall back to the current monthly charge.
dfdata['meancharges'] = (dfdata['totalcharges'] / dfdata['tenure'].replace(0, np.nan)).fillna(dfdata['monthlycharges'])
# Guarded so that re-running the cell does not register the feature twice.
if 'meancharges' not in numeric_features:
    numeric_features.append('meancharges')
fig,ax = plt.subplots(1,2,figsize=(18,4))
sns.scatterplot(x='monthlycharges',y='meancharges',data = dfdata,ax = ax[0])
sns.boxplot(x = 'tenure_bin',y = 'monthlycharges',data= dfdata,ax = ax[1])
plt.show()

In [None]:
# Three-way comparison of the historical mean charge with the current monthly
# charge, stored as the strings 'Equal', 'True' (mean is higher) or 'False'.
mean_equals_monthly = dfdata['meancharges'] == dfdata['monthlycharges']
mean_above_monthly = dfdata['meancharges'] > dfdata['monthlycharges']
dfdata['comparemean'] = pd.Series(
    np.select([mean_equals_monthly, mean_above_monthly], ['Equal', 'True'], default='False'),
    index=dfdata.index,
).astype('category')
plot_categoric_feature('comparemean')

- As tenure reaches to 3 years, customer gets extra discounts and average monthly charges decrease.
- At first two years, average monthly charges increase with tenure.

## Categorical Features

In [None]:
dfdata[categoric_features].nunique()

In [None]:
plot_categoric_feature('gender')

- Gender seems to have no effect on churn rate and monthly charges

In [None]:
plot_categoric_feature('seniorcitizen')

- Most customers are not senior citizens; senior customers tend to churn more and pay higher monthly charges.

In [None]:
plot_categoric_feature('partner')

- Customers without a partner tend to churn more. Nearly half of the customers have a partner.

In [None]:
plot_categoric_feature('dependents')

- Customers who have no dependents tend to churn. Nearly 30% of customers have dependents.

In [None]:
plot_categoric_feature('phoneservice')

- 90% of customers use phone services but churn rate seems to be same among customers.
- Using phone services means higher monthly charges but it doesnt change churn rate so much.

In [None]:
pd.crosstab(dfdata['phoneservice'],dfdata['multiplelines'])

In [None]:
plot_categoric_feature('multiplelines')

- Phone services info is given in multiple lines feature. Churn rate seems to be same among customers using multiplelines

In [None]:
plot_categoric_feature('internetservice')

- Most of the customers use fiberoptic and they tend to churn more.
- Fiberoptic internet service is more expensive than DSL as expected.

In [None]:
plot_categoric_feature('onlinesecurity')

- 50% of customers dont use online security and they tend to churn more.
- Online security service seems to be free or so cheap for customers using internet service.

In [None]:
plot_categoric_feature('onlinebackup')

- 50% of customers dont use online backup and they tend to churn more.
- Online backup service seems to be free or so cheap for customers using internet service.

In [None]:
plot_categoric_feature('deviceprotection')

- 45% of customers dont use device protection and they tend to churn more.
- Device protection service seems to be so cheap for customers using internet service.

In [None]:
plot_categoric_feature('techsupport')

- 50% of customers don't use tech support and they tend to churn more.

In [None]:
plot_categoric_feature('streamingtv')

In [None]:
plot_categoric_feature('streamingmovies')

- Using streamingmovies or streamingtv has little effect on churn rate.
- Prices seem to be nearly same for both services.

In [None]:
plot_categoric_feature('contract')

- Monthly contracts pay less and tend to churn more.

In [None]:
plot_categoric_feature('paperlessbilling')

- 60% of customers use paperlessbilling, they tend to pay more charges and tend to churn more.

In [None]:
plot_categoric_feature('paymentmethod')

- 20% of customers use mailed check, paying less and their churn rate is lower.
- Monthly charges dont change much with other payment methods.
- 30% of customers use electronic check and they tend to churn more.

- Gender, PhoneService, MultipleLines dont have a clear difference in distribution of churn rates.

In [None]:
dfdata['internet_fiber'] = np.where(dfdata['internetservice'] == 'Fiber optic','Yes','No')
dfdata['monthly_contract'] = np.where(dfdata['contract'] == 'Month-to-month','Yes','No')
dfdata['electronic_payment'] = np.where(dfdata['paymentmethod'] == 'Electronic check','Yes','No')
categoric_features.extend(['internet_fiber','monthly_contract','electronic_payment'])

In [None]:
dfdata['internet'] = np.where(dfdata['internetservice'] == 'No','No','Yes')
dfdata['num_services'] = (dfdata[['internet','onlinesecurity','onlinebackup','deviceprotection','techsupport','streamingtv','streamingmovies']] == 'Yes').sum(axis=1)
dfdata['num_services'] = dfdata['num_services'].astype('category')
plot_categoric_feature('num_services')
categoric_features.append('internet')
dfdata['num_services'] = dfdata['num_services'].astype('int')
numeric_features.append('num_services')


In [None]:
# Relative deviation of each customer's monthly charge from the overall mean.
monthly_mean = dfdata['monthlycharges'].mean()
dfdata['monthly_mean_diff'] = (dfdata['monthlycharges'] - monthly_mean) / monthly_mean
new_numeric = ['monthly_mean_diff']

# Per service: monthly charge divided by the mean monthly charge of the
# customers sharing the same value of that service column (a ratio, despite
# the '_mean_diff' suffix).
services_list = ['internetservice','onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport','streamingtv', 'streamingmovies']
for service in services_list:
    colname = service + '_mean_diff'
    dfdata[colname] = dfdata['monthlycharges'] / dfdata.groupby(service)['monthlycharges'].transform('mean')
    new_numeric.append(colname)

# Register only names that are not listed yet, so re-running the cell does not
# create duplicate entries (and duplicate model columns later on).
numeric_features.extend(name for name in new_numeric if name not in numeric_features)

# Feature Selection

In [None]:
le = LabelEncoder()
encoded_features = []
for feature in categoric_features:
    colname = 'le_' + feature
    dfdata[colname] = le.fit_transform(dfdata[feature])
    encoded_features.append(colname)

In [None]:
plt.figure(figsize=(12,6))
sns.heatmap(dfdata[numeric_features].corr(),annot = True,fmt  ='.1g')
plt.show()

In [None]:
drop_list = ['meancharges','totalcharges','monthly_mean_diff','onlinebackup_mean_diff','deviceprotection_mean_diff','techsupport_mean_diff','streamingmovies_mean_diff']

In [None]:
plt.figure(figsize=(12,6))
sns.heatmap(dfdata[numeric_features].drop(drop_list,axis=1).corr(),annot = True,fmt  ='.1g')
plt.show()

In [None]:
numeric_features = [x for x in numeric_features if x not in drop_list]

In [None]:
dfdata.columns

In [None]:
plt.figure(figsize=(18,12))
sns.heatmap(dfdata[encoded_features].corr(),annot = True,fmt  ='.1g')
plt.show()

In [None]:
drop_list = ['le_phoneservice','le_contract','le_internet','le_tenure_bin']

In [None]:
plt.figure(figsize=(18,12))
sns.heatmap(dfdata[encoded_features].drop(drop_list, axis=1).corr(),annot = True,fmt  ='.1g')
plt.show()

In [None]:
# Keep the label-encoded columns that are not in drop_list. The former
# `and categoric_features` clause only tested that the list was non-empty
# and did not filter anything.
categoric_features = [x for x in encoded_features if x not in drop_list]

In [None]:
[categoric_features + numeric_features]

# Logistic Regression
- Normal regression formula into sigmoid function
- Result is the probability of true, 1 if positive, 0 if negative
- Linear split of data space
- Selection of decision boundary is important
- Regression Assumptions
    - Handling of outlier data points
    - No perfect multicollinearity between the predictors (via VIF Factor or correlation)
- Regularization type and magnitude are important parameters.
- Uses logarithmic loss function to determine classes

<img src='https://ai-master.gitbooks.io/logistic-regression/assets/sigmoid_function.png' width='50%' height = '50%'>

In [None]:
dftemp = make_clf_data(100)
X = dftemp.drop('target',axis=1)
y = dftemp['target']
log_reg = LogisticRegression()
log_reg.fit(X, y)
print(np.round(log_reg.intercept_, 2), np.round(log_reg.coef_, 2))
print_clf_result(log_reg, X, y)

In [None]:
plot_decision_boundary(log_reg,dftemp)

In [None]:
c_list = [0.01,0.1,1,10]
plt.figure(figsize=(18,5))
for index, c_value in enumerate(c_list):
    title = 'C : ' + str(c_value)
    log_reg = LogisticRegression(C = c_value).fit(X,y)
    plt.subplot(1,len(c_list),index+1)
    plot_decision_boundary(log_reg, dftemp, title)
plt.show()

- low C parameter means, high regularization and high bias.
- high C parameter means, high risk of overfitting.

In [None]:
X = dfdata[numeric_features + categoric_features]
y = dfdata['churn'].map({'No':0,'Yes':1})
# Stratified so both parts keep the churn ratio; seeded so the split (and every
# score derived from it) is the same after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=42)

In [None]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)
print(np.round(log_reg.intercept_, 2), np.round(log_reg.coef_, 2))
print_clf_result(log_reg, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
c_list = [0.001,0.01,0.1,1,10,100,1000]
accuracy_list = []
coef_list = []
fig,ax = plt.subplots(1,2,figsize=(18,5))
for index, c_value in enumerate(c_list):
    log_reg = LogisticRegression(C = c_value).fit(X_train_scaled,y_train)
    y_pred = log_reg.predict(X_train_scaled)
    accuracy_list.append(log_reg.score(X_train_scaled,y_train))
    ax[0].plot(np.array(log_reg.coef_).ravel(),label = 'c:'+str(c_value))
    ax[0].legend()
    ax[0].set_title('Coefficients')
ax[1].plot(accuracy_list)
ax[1].set_title('Accuracy')
plt.xticks(range(len(c_list)),c_list)
plt.show()

In [None]:
# Sweep the inverse regularisation strength C of an L1-penalised logistic
# regression and record how many coefficients are driven to zero.
# The standardised features are used: an L1 penalty acts on the coefficient
# magnitudes, which depend on the feature scale, and the cells that use the
# chosen C afterwards fit on X_train_scaled as well.
c_list = [0.001,0.01,0.05,0.1,1,10]
score = []
n_zero_coefs = []
for c_value in c_list:
    log_reg = LogisticRegression(C=c_value, penalty='l1', solver='liblinear').fit(X_train_scaled,y_train)
    coef = np.round(log_reg.coef_,4)
    n_zero_coefs.append(int(np.sum(coef == 0)))
    score.append(round(log_reg.score(X_train_scaled, y_train),2))

# The swept value is sklearn's C, so the column is labelled accordingly.
dftemp = pd.DataFrame(zip(c_list, n_zero_coefs, score), columns = ['C','zero_coef','score'])
dftemp

In [None]:
estimator = LogisticRegression(C=0.05, penalty='l1', solver='liblinear')
log_reg = estimator.fit(X_train_scaled, y_train)
rfe = RFE(estimator,n_features_to_select=3).fit(X_train_scaled, y_train)
dftemp = pd.DataFrame(zip(X_train_scaled.columns.values,rfe.ranking_,log_reg.coef_.ravel()),columns = ['feature','rank','coef'])
dftemp.sort_values('rank', ascending=True, inplace=True)
dftemp.reset_index(inplace=True,drop=True)
dftemp.tail(10)

In [None]:
X_temp = X_train_scaled.drop(dftemp.iloc[-8:,0].values,axis = 1)
log_reg = LogisticRegression()
log_reg.fit(X_temp, y_train)
print(np.round(log_reg.intercept_, 2), np.round(log_reg.coef_, 2))
print_clf_result(log_reg, X_temp, y_train)

# SVM
- Linear and non linear support vector machines
- Split data via linear vectors
- Kernel trick : transformation of data to split data via linear vectors
- Uses hinge loss function to determine classes
- Hinge loss : Higher accuracy, worse probability analysis
- High performance on low feature, low volume data

In [None]:
# Remove the eight lowest-ranked features listed at the end of dftemp.
# A new frame is assigned instead of dropping in place, because X was created
# as a column selection of dfdata.
X = X.drop(columns=dftemp.iloc[-8:,0].values)
# Stratify on y like the first split so train and test keep the churn ratio;
# seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=42)

In [None]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [None]:
svc = LinearSVC(C=0.05)
svc.fit(X_train_scaled,y_train)
print(np.round(svc.intercept_, 2), np.round(svc.coef_, 2))
print_clf_result(svc, X_train_scaled, y_train,X_test_scaled,y_test)

## SVC

In [None]:
svc = SVC(C=1, gamma=0.1, kernel = 'rbf')
svc.fit(X_train_scaled,y_train)
print_clf_result(svc, X_train_scaled, y_train,X_test_scaled,y_test)

# SGD Classifier
- Gradient Descent Method
- User defined loss functions can be used in addition to Hinge and Log loss functions
- Efficient on high volume data

In [None]:
sgd = SGDClassifier(loss='hinge', alpha=0.05, eta0=0.01)
sgd.fit(X_train_scaled, y_train)
print_clf_result(sgd, X_train_scaled, y_train, X_test_scaled, y_test)

# Naive Bayes Classifier
- Based on Bayes probability theorem
- Can be used for determination of baseline accuracy
- Fast training but worse generalization performance
- Best in low volume data
    - GaussianNB is used for continuous features
    - BernoulliNB is used for binary features
    - MultinomialNB is used for count (non-negative, multinomially distributed) features

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print_clf_result(gnb, X_train, y_train, X_test, y_test)

In [None]:
mnb = MultinomialNB(alpha=0.05)
mnb.fit(X_train, y_train)
# Report the model fitted in this cell (previously gnb was passed by mistake).
print_clf_result(mnb, X_train, y_train, X_test, y_test)

In [None]:
bnb = BernoulliNB(alpha=0.05)
bnb.fit(X_train, y_train)
print_clf_result(bnb, X_train, y_train, X_test, y_test)

# Decision Tree Classifier

In [None]:
dt = DecisionTreeClassifier(max_depth=5, random_state=12)
dt.fit(X_train_scaled, y_train)
print_clf_result(dt, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
dftemp = pd.DataFrame(zip(X.columns.values,dt.feature_importances_), columns = ['feature','importance'])
plt.figure(figsize=(18,5))
sns.barplot(x='importance', y='feature', data=dftemp)
plt.show()

# Uncertainty Estimates
- Decision function
- Probability Predictions
- Not supported by all models
- Can be used via changing thresholds

In [None]:
dftemp = pd.DataFrame(np.round(100 * log_reg.predict_proba(X_test_scaled),0),columns = ['prob0','prob1'])
dftemp['y'] = y_test.values
dftemp['y_pred'] = log_reg.predict(X_test_scaled)
dftemp['decision'] = np.round(log_reg.decision_function(X_test_scaled), 2)
dftemp['error'] = np.abs(dftemp['y'] - dftemp['y_pred'])
print("Min and max decision function values : ",round(np.min(log_reg.decision_function(X_test_scaled)),2),round(np.max(log_reg.decision_function(X_test_scaled)),2))
dftemp.head()


In [None]:
bins = range(0,100,5)
dftemp['prob1bin'] = np.digitize(dftemp['prob1'],bins,right=True)
sns.barplot(x='prob1bin',y='error',data=dftemp)
plt.show()

# Imbalanced Data
- Balance data via data preparation methods, use of suitable classification algorithms or performance metrics
- Random Under Sampling
    - Discard majority class data
    - Information loss
- Random Over Sampling
    - Random generation of minority class data
    - Risk of overfit
- Cluster Based Over Sampling
    - Cluster minority and majority class data independently.
    - Random generation of data for each cluster.
    - Risk of overfit
- SMOTE (Synthetic Minority OverSampling Technique)
    - Random sub sample of minority class data
    - Generate synthetic data from random selected data via KNN
    - Not good performance on high volume data
- Imbalanced Data Classifiers
   - Ensemble Classifiers
   - Cost Sensitive Classifiers
- Imbalanced Data Performance Metrics
    - F1 score
    - F2 score
    - ROC AUC - PR AUC
    - Precision and Recall
    - Accuracy and G-Mean

In [None]:
dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train_scaled, y_train)
pred_most_frequent = dummy_majority.predict(X_test_scaled)
print("Test score: {:.2f}".format(dummy_majority.score(X_test_scaled, y_test)))

In [None]:
round(100 * dfdata['churn'].value_counts() / dfdata.shape[0], 0)

## Data Preparation Methods

In [None]:
# Randomly discard majority-class rows of the training set until the classes
# are balanced. fit_resample is the current imbalanced-learn API (fit_sample
# was removed); the seed makes the selected rows reproducible.
rus = under_sampling.RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train_scaled,y_train)

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_rus, y_rus)
print(np.round(log_reg.intercept_, 2), np.round(log_reg.coef_, 2))
print_clf_result(log_reg, X_rus, y_rus,X_test_scaled,y_test)

In [None]:
# Synthesize additional minority-class rows for the training set with SMOTE.
# fit_resample is the current imbalanced-learn API (fit_sample was removed);
# the seed makes the synthetic rows reproducible.
smote = over_sampling.SMOTE(sampling_strategy='minority', random_state=42)
X_sm, y_sm = smote.fit_resample(X_train_scaled, y_train)

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_sm, y_sm)
print(np.round(log_reg.intercept_, 2), np.round(log_reg.coef_, 2))
print_clf_result(log_reg, X_sm, y_sm,X_test_scaled,y_test)

## Balancing via Classifier

In [None]:
svc = SVC(C=1, gamma=0.1, kernel = 'rbf', class_weight='balanced', probability=True)
svc.fit(X_train_scaled, y_train)
print_clf_result(svc, X_train_scaled, y_train, X_test_scaled, y_test)

# Performance Metrics

## Confusion Matrix

In [None]:
plot_empty_confusion_matrix()

- Positive Class : Geri Dönen Müşteri
- Dönmeyecek müşteriyi döner olarak tahmin etmenin (FP) maliyeti =
- Geri dönen müşteriyi dönmez olarak tahmin etmenin (FN) maliyeti = 

## Common Metrics
\begin{equation}
\text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}
\end{equation}

\begin{equation}
\text{Precision (Positive Prediction Value)} = \frac{\text{TP}}{\text{TP} + \text{FP}}
\end{equation}

\begin{equation}
\text{Recall (Sensitivity, TPR)} = \frac{\text{TP}}{\text{TP} + \text{FN}}
\end{equation}

\begin{equation}
\text{FPR} = \frac{\text{FP}}{\text{FP} + \text{TN}}
\end{equation}

\begin{equation}
\text{Specificity} = \frac{\text{TN}}{\text{FP} + \text{TN}}
\end{equation}

\begin{equation}
\text{G-Mean} = \sqrt{\text{Sensitivity} \cdot \text{Specificity}}
\end{equation}

\begin{equation}
\text{F1-Score} = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}
\end{equation}

In [None]:
y_pred = log_reg.predict(X_test_scaled)
plt.figure(figsize=(2,2))
plot_confusion_matrix(log_reg, y_test, y_pred)
plt.show()
print(classification_report(y_test,y_pred))

In [None]:
y_pred_threshold = log_reg.decision_function(X_test_scaled) > -0.2
plt.figure(figsize=(2,2))
plot_confusion_matrix(log_reg, y_test, y_pred_threshold)
plt.show()
print(classification_report(y_test,y_pred_threshold))

In [None]:
y_pred_threshold = log_reg.predict_proba(X_test_scaled)[:,1] > 0.35
plt.figure(figsize=(2,2))
plot_confusion_matrix(log_reg, y_test, y_pred_threshold)
plt.show()
print(classification_report(y_test,y_pred_threshold))

## Precision-Recall Curve and ROC Curve
- Tradeoff between recall and precision
- Find best threshold to optimize both
- Use in GridSearchCV, model, cross_val_score scoring parameter or test via different model hyperparameters

In [None]:
# Score the test set once per model and reuse the scores below.
lr_proba = log_reg.predict_proba(X_test_scaled)[:, 1]
lr_margin = log_reg.decision_function(X_test_scaled)
svc_margin = svc.decision_function(X_test_scaled)

aps_logreg = round(average_precision_score(y_test, lr_proba),2)
aps_svc = round(average_precision_score(y_test, svc_margin),2)
print("Average Precision Scores (log reg and svc) : ", aps_logreg, aps_svc)

# Logistic regression: the marker shows the operating point whose threshold
# is closest to 0 on the decision function.
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_test, lr_margin)
close_zero_lr = np.argmin(np.abs(thresholds_lr))
plt.plot(
    precision_lr[close_zero_lr], recall_lr[close_zero_lr], 'o',
    markersize=10, label="threshold zero logreg", fillstyle="none", c='k', mew=2,
)
plt.plot(precision_lr, recall_lr, label="log reg")

# SVC: same construction with a triangle marker.
precision_svc, recall_svc, thresholds_svc = precision_recall_curve(y_test, svc_margin)
close_zero_svc = np.argmin(np.abs(thresholds_svc))
plt.plot(
    precision_svc[close_zero_svc], recall_svc[close_zero_svc], 'v',
    markersize=10, label="threshold zero svc", fillstyle="none", c='k', mew=2,
)
plt.plot(precision_svc, recall_svc, label="svc")

plt.legend()
plt.title('Precision Recall Curve')
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.show()

In [None]:
plot_empty_confusion_matrix()

\begin{equation}
\text{TPR} = \frac{\text{TP}}{\text{TP} + \text{FN}}
\end{equation}

\begin{equation}
\text{FPR} = \frac{\text{FP}}{\text{FP} + \text{TN}}
\end{equation}


In [None]:
# Area under the ROC curve for both models on the test set.
auc_logreg = roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:, 1])
auc_svc = roc_auc_score(y_test, svc.decision_function(X_test_scaled))
print("AUC scores (logreg and svc) : ", round(auc_logreg,2), round(auc_svc,2))

# Logistic regression ROC curve; the marker shows the operating point whose
# threshold is closest to 0 on the decision function.
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, log_reg.decision_function(X_test_scaled))
plt.plot(fpr_lr, tpr_lr, label="ROC LogReg")
close_zero_lr = np.argmin(np.abs(thresholds_lr))
plt.plot(fpr_lr[close_zero_lr], tpr_lr[close_zero_lr], 'o', markersize=10,label="threshold zero logreg", fillstyle="none", c='k', mew=2)

# SVC ROC curve with its own marker. The two markers get distinct legend
# labels so they can be told apart.
fpr_svc, tpr_svc, thresholds_svc = roc_curve(y_test, svc.decision_function(X_test_scaled))
plt.plot(fpr_svc, tpr_svc, label="ROC SVC")
close_zero_svc = np.argmin(np.abs(thresholds_svc))
plt.plot(fpr_svc[close_zero_svc], tpr_svc[close_zero_svc], 'v', markersize=10, label="threshold zero svc", fillstyle="none", c='k', mew=2)

plt.title('ROC Curve')
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
plt.legend(loc=4)
plt.show()