# Customer Churn

In [None]:
# Data visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# MODEL
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Some functions used before and after the model
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, \
    classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

# Errors and data representation
import warnings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 150)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
warnings.simplefilter(action = "ignore")

In [None]:
df_ = pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df = df_.copy()

### Let's take a look at the data

In [None]:
df.head()

* **We can't use customerID so we can drop it**

In [None]:
df.drop('customerID', axis=1, inplace=True)

# Exploratory Data Analysis

In [None]:
df.info()

* **TotalCharges is a numeric column but it is stored as object, and SeniorCitizen is a categorical column but it is stored as numeric. We need to fix this.**

In [None]:
df['SeniorCitizen'] = df['SeniorCitizen'].astype('O')
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

* **Are there any missing values? Let's have a check**

In [None]:
df.isnull().sum()

* **Yes, there are missing values in TotalCharges. Let's have a look at them.**

In [None]:
df[df['TotalCharges'].isnull()]

* **Yes, these values are missing. But why are they missing? When I look at the tenure values, I see that the tenure of these customers is 0. So I set their TotalCharges equal to MonthlyCharges instead of dropping these rows from the data.**

In [None]:
# Remember which rows had an unparseable TotalCharges (kept in `null_values` so the
# next cell can display them), then copy MonthlyCharges into TotalCharges for them.
null_values = df[df['TotalCharges'].isnull()].index.to_list()
missing_mask = df.index.isin(null_values)
df.loc[missing_mask, 'TotalCharges'] = df.loc[missing_mask, 'MonthlyCharges']

In [None]:
df[df.index.isin(null_values)] # They are also new customer.

* **There is no missing value anymore**

In [None]:
df.isnull().sum()

* **I'm determining the columns types but first I'll create a function for this.**

In [None]:
def grab_cols(dataframe, cat_th=10, car_th=20):
    """Split the columns of `dataframe` into four (possibly overlapping) groups.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        Non-object columns with fewer than `cat_th` distinct values are reported as
        "numeric but categorical"; those with at least `cat_th` distinct values are
        reported as numeric.
    car_th : int, default 20
        Object columns with more than `car_th` distinct values are reported as
        "categorical but high-cardinality".

    Returns
    -------
    cat_cols : list
        Every column with object dtype (this includes the high-cardinality ones).
    num_but_cat : list
        Non-object columns with fewer than `cat_th` distinct values.
    cat_but_car : list
        Object columns with more than `car_th` distinct values.
    num_cols : list
        Columns that are neither object nor datetime64[ns] and have at least
        `cat_th` distinct values.
    """
    cat_cols, num_but_cat, cat_but_car, num_cols = [], [], [], []

    for col in dataframe.columns:
        col_dtype = dataframe[col].dtypes
        n_unique = dataframe[col].nunique()  # computed once per column

        if col_dtype == "O":
            cat_cols.append(col)  # Categorical
            if n_unique > car_th:
                cat_but_car.append(col)  # Categorical but high cardinal
            continue

        if n_unique < cat_th:
            num_but_cat.append(col)  # Numeric but categorical
        # Use cat_th (not a hard-coded 10) with >= so that a column with exactly
        # cat_th distinct values is not silently left out of both numeric groups.
        if col_dtype not in ['O', 'datetime64[ns]'] and n_unique >= cat_th:
            num_cols.append(col)  # Numeric

    return cat_cols, num_but_cat, cat_but_car, num_cols

In [None]:
cat_cols, num_but_cat, cat_but_car, num_cols = grab_cols(df)

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
cat_but_car, num_but_cat # There is not.

* **I'll convert the target variable from object to numeric so that its correlation with the other variables can be computed.**

In [None]:
df['Churn'] = np.where(df['Churn'] == 'Yes', 1, 0)

## Graphics

In [None]:
plt.figure(figsize=(12,6))
(pd.get_dummies(df).corr()['Churn']).sort_values(ascending=False).plot(kind='bar');

* **When we look at the graph, there is no correlation exceeding 40%, so it can be said that there is no serious correlation between the target variable and the independent variables.**

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(df[num_cols].corr(), annot=True, cmap='Reds')
plt.title('Correlation HeatMap');

* **When we look at the correlation graphs of numeric columns, there is a high correlation between tenure and TotalCharges, and there is also a correlation between MonthlyCharges and TotalCharges. But since it does not exceed 85% I won't drop one of both.**

In [None]:
df[num_cols].hist(figsize = (14,8), bins=12);

* **Numeric columns are not normally distributed. And there is also skewness and kurtosis.**

* **The distribution and graphics of categorical columns are as follows;**

In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the value counts and percentage share of one categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing `col_name`.
    col_name : str
        Column to summarise.
    plot : bool, default False
        When True, also draw a seaborn count plot of the column.
    """
    counts = dataframe[col_name].value_counts()
    summary = pd.DataFrame({col_name: counts,
                            "Ratio": 100 * counts / len(dataframe)})
    print(summary, end='\n\n\n')

    if not plot:
        return

    sns.countplot(x=dataframe[col_name], data=dataframe)
    plt.title(col_name)
    plt.xticks(rotation = 45)
    plt.show()

In [None]:
for col in cat_cols:
    cat_summary(df, col, plot=True)

**When I examined the graphs and distributions, I noticed that:**
* **The "no internet service" information is repeated in 7 different variables, and the number of these customers is 1526.**
* **The "no phone service" information is repeated in 2 different variables, and the number of these customers is 682.**

**We need to pay attention to these properties when creating a dummy variables.**

dummy_diff = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']

**These columns duplicate the "no internet service" and "no phone service" information.**

# Some Descriptive Statistics

In [None]:
# Count customers per (Churn, Contract) pair, pivot Churn into columns and
# derive the churn percentage of every contract type.
churn_to_contract = (
    df.groupby(['Churn', 'Contract'])
    .agg({'Contract': 'count'})
    .unstack()
    .T
    .reset_index()
    .drop('level_0', axis=1)
)
churn_to_contract.columns = ['Contract', 'Churn_0', 'Churn_1']
churn_to_contract['Churn_Ratio'] = (
    churn_to_contract['Churn_1'] * 100
    / (churn_to_contract['Churn_0'] + churn_to_contract['Churn_1'])
)
churn_to_contract

* **The churn percentage of monthly subscriptions is quite high, while that of 2-year subscriptions is quite low. More campaigns can be run for monthly subscribers, or these subscribers can be offered an incentive to move to a 2-year contract.**

In [None]:
# Count customers per (Churn, InternetService) pair, pivot Churn into columns and
# list the internet service types from highest to lowest churn percentage.
churn_to_internet = (
    df.groupby(['Churn', 'InternetService'])
    .agg({'InternetService': 'count'})
    .unstack()
    .T
    .reset_index()
    .drop('level_0', axis=1)
)
churn_to_internet.columns = ['InternetService', 'Churn_0', 'Churn_1']
churn_to_internet['Churn_Ratio'] = (
    churn_to_internet['Churn_1'] * 100
    / (churn_to_internet['Churn_0'] + churn_to_internet['Churn_1'])
)
churn_to_internet.sort_values(by='Churn_Ratio', ascending=False)

* **The churn rate of fiber users is about 42%. Is there an infrastructure problem where these users are located? In addition, customers who are not internet users have a low churn rate, which shows that they do not have problems with other services.**

In [None]:
# Cross-tabulate PaperlessBilling (rows) against InternetService (columns) as counts.
internet_to_billing = (
    df.groupby(['InternetService', 'PaperlessBilling'])
    .agg({'PaperlessBilling': 'count'})
    .unstack()
    .T
    .reset_index()
    .drop('level_0', axis=1)
)
internet_to_billing.columns = ['PaperlessBilling', 'DSL', 'Fiber optic', 'No']
internet_to_billing

* **There are 446 people who pay their bills paperless and are not internet users, and internet service can be sold to these customers. Because they are probably a potential internet user.**

In [None]:
# Count customers per (Churn, PaymentMethod) pair, pivot Churn into columns and
# list the payment methods from highest to lowest churn percentage.
churn_to_payment = (
    df.groupby(['Churn', 'PaymentMethod'])
    .agg({'Churn': 'count'})
    .unstack()
    .T
    .reset_index()
    .drop('level_0', axis=1)
)
churn_to_payment.columns = ['PaymentMethod', 'Churn_0', 'Churn_1']
churn_to_payment['Churn_Ratio'] = (
    churn_to_payment['Churn_1'] * 100
    / (churn_to_payment['Churn_0'] + churn_to_payment['Churn_1'])
)
churn_to_payment.sort_values(by='Churn_Ratio', ascending=False)

* **The churn rate of users who make electronic payments as a payment method is too high. Maybe there is a problem with this payment method.**

In [None]:
# Count customers per (Churn, SeniorCitizen) pair, pivot Churn into columns and
# derive the churn percentage of each SeniorCitizen group.
churn_to_citizen = (
    df.groupby(['Churn', 'SeniorCitizen'])
    .agg({'Churn': 'count'})
    .unstack()
    .T
    .reset_index()
    .drop('level_0', axis=1)
)
churn_to_citizen.columns = ['SeniorCitizen', 'Churn_0', 'Churn_1']
churn_to_citizen['Churn_Ratio'] = (
    churn_to_citizen['Churn_1'] * 100
    / (churn_to_citizen['Churn_0'] + churn_to_citizen['Churn_1'])
)
churn_to_citizen

* **The churn ratio of senior citizens is quite high. Maybe a special campaign can be made for these users.**

# Outliers

In [None]:
def outliers_threshold(dataframe, column, low_quantile=0.05, up_quantile=0.95):
    """Return the lower and upper outlier limits for one column.

    The limits follow the usual "1.5 x range" fence rule, but the range is measured
    between `low_quantile` and `up_quantile` rather than between the quartiles.
    The defaults (5th and 95th percentile) reproduce the previous hard-coded
    behaviour; pass 0.25 / 0.75 for the classic IQR rule.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing `column`.
    column : str
        Numeric column to analyse.
    low_quantile, up_quantile : float
        Quantiles (between 0 and 1) that delimit the reference range.

    Returns
    -------
    (low, up) : tuple
        Values below `low` or above `up` are considered outliers.
    """
    lower_q = dataframe[column].quantile(low_quantile)
    upper_q = dataframe[column].quantile(up_quantile)
    spread = upper_q - lower_q
    low = lower_q - 1.5 * spread
    up = upper_q + 1.5 * spread
    return low, up

In [None]:
def grab_outlier(dataframe, column, index=False):
    """Print the rows of `dataframe` whose `column` value lies outside the outlier limits.

    The limits come from `outliers_threshold`. All outlier rows are printed when
    there are fewer than 10 of them; otherwise only the first rows (`head()`) are shown.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing `column`.
    column : str
        Column to inspect.
    index : bool, default False
        When True, the index of the outlier rows is returned.

    Returns
    -------
    pandas.Index or None
        Index of the outlier rows if `index` is True, otherwise None.
    """
    low, up = outliers_threshold(dataframe, column)
    values = dataframe[column]
    outliers = dataframe[(values < low) | (values > up)]

    print(outliers if outliers.shape[0] < 10 else outliers.head())

    if index:
        return outliers.index

In [None]:
for col in num_cols:
    grab_outlier(df[num_cols], col)

* **There are no outlier observations in the numeric columns.**

# Feature Engineering

In [None]:
df[['tenure', 'MonthlyCharges']].describe().T

In [None]:
df[df['tenure'] == 0]['Churn']

* **I separate the tenure and MonthlyCharges values into different segments**
* **It seems that users with a tenure of 0 have not churned because they are new customers.**
* **I create a separate category for these values.**

In [None]:
# pd.cut builds right-closed bins, so without include_lowest=True the first bin is
# (0, 0.5] and customers with tenure == 0 would get NaN instead of 'New'.
df['new_tenure_segment'] = pd.cut(df['tenure'], bins=[0, 0.5, 30, 60, 100],
                                  labels=['New', 'Low', 'Medium', 'High'],
                                  include_lowest=True).astype('O')

In [None]:
df['new_MonhlyCharges_segment'] = pd.cut(df['MonthlyCharges'], bins=[0, 70, 90, 120],
                                 labels=['Low', 'Medium', 'High']).astype('O')

In [None]:
# Count customers per (Churn, tenure segment) pair, pivot Churn into columns and
# list the tenure segments from highest to lowest churn percentage.
churn_to_segment = (
    df.groupby(['Churn', 'new_tenure_segment'])
    .agg({'new_tenure_segment': 'count'})
    .unstack()
    .T
    .reset_index()
    .drop('level_0', axis=1)
)
churn_to_segment.columns = ['new_tenure_segment', 'Churn_0', 'Churn_1']
churn_to_segment['Churn_Ratio'] = (
    churn_to_segment['Churn_1'] * 100
    / (churn_to_segment['Churn_0'] + churn_to_segment['Churn_1'])
)
churn_to_segment.sort_values('Churn_Ratio', ascending=False)

**When we look at the tenure segments, we see that users with high tenure values churn less.**

In [None]:
# Count customers per (Churn, MonthlyCharges segment) pair, pivot Churn into columns
# and list the charge segments from highest to lowest churn percentage.
churn_to_mcharges = (
    df.groupby(['Churn', 'new_MonhlyCharges_segment'])
    .agg({'new_MonhlyCharges_segment': 'count'})
    .unstack()
    .T
    .reset_index()
    .drop('level_0', axis=1)
)
churn_to_mcharges.columns = ['new_MonhlyCharges_segment', 'Churn_0', 'Churn_1']
churn_to_mcharges['Churn_Ratio'] = (
    churn_to_mcharges['Churn_1'] * 100
    / (churn_to_mcharges['Churn_0'] + churn_to_mcharges['Churn_1'])
)
churn_to_mcharges.sort_values('Churn_Ratio', ascending=False)

**When we look at the MonthlyCharges segments, we see that users with low monthly fees churn less.**

* **When we look at the 'MonthlyCharges' and 'tenure' variables, we can notice that the 'tenure' value is more effective on the churn, but 'MonthlyCharge' is not that much.**

* **I create a score from these two variables, giving each of them a different weight.**

# Customer Segmentation

**First, I scale the tenure and MonthlyCharges variables over the same range. Because I'm going to generate a score using both.**

In [None]:
scaler = MinMaxScaler(feature_range=(1, 10))
scaler.fit(df[['tenure', 'MonthlyCharges']])
df[['tenure_scaled', 'MonthlyCharges_scaled']] = scaler\
.transform(df[['tenure', 'MonthlyCharges']])

In [None]:
# New scaled range 1-10
df[['tenure', 'tenure_scaled', 'MonthlyCharges', 'MonthlyCharges_scaled']].head(10)

**Now I'll create segment_score_1 and use it to assign the customers to segments.**

In [None]:
df['segment_score_1'] = (0.65 * df['tenure_scaled'] + 0.35 * df['MonthlyCharges_scaled'])
df['segment_1'] = pd.qcut(df['segment_score_1'], 5, 
                          labels=['E', 'D', 'C', 'B', 'A']).astype('O')

In [None]:
df[['segment_score_1', 'segment_1']].head(10)

In [None]:
# Count customers per (Churn, segment_1) pair, pivot Churn into columns and
# list the segments from lowest to highest churn percentage.
churn_to_segment_1 = (
    df.groupby(['Churn', 'segment_1'])
    .agg({'segment_1': 'count'})
    .unstack()
    .T
    .reset_index()
    .drop('level_0', axis=1)
)
churn_to_segment_1.columns = ['segment_1', 'Churn_0', 'Churn_1']
churn_to_segment_1['Churn_Ratio'] = (
    churn_to_segment_1['Churn_1'] * 100
    / (churn_to_segment_1['Churn_0'] + churn_to_segment_1['Churn_1'])
)
churn_to_segment_1.sort_values('Churn_Ratio', ascending=True)

**As can be seen, the churn values decrease as the segment values increase.**

In [None]:
df.head()

In [None]:
# I'll drop these columns. I won't use them anymore.
df.drop(['tenure_scaled', 'segment_score_1', 'MonthlyCharges_scaled', 'tenure', 'MonthlyCharges'], 
        axis=1, inplace=True)

In [None]:
df['Churn'] = df['Churn'].astype('O')

# Encoding

**I created columns type for encoding process.**

In [None]:
binary_cols = [col for col in df.columns if df[col].nunique() == 2]
binary_cols

In [None]:
multi_col = [col for col in df.columns if df[col].nunique() > 2 and df[col].dtypes == 'O']
multi_col

In [None]:
num_col = [col for col in df.columns if df[col].dtypes in ['float64', 'int64']]
num_col

### Functions

In [None]:
def label_encoder(dataframe, binary_col):
    """Label-encode one column of `dataframe` in place.

    The values of `binary_col` are replaced by the integer codes produced by
    scikit-learn's `LabelEncoder`. The frame passed in is modified and also
    returned, so the call can be used either as a statement or in an assignment.
    """
    codes = preprocessing.LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = codes
    return dataframe

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return a copy of `dataframe` with `categorical_cols` replaced by dummy columns.

    Thin wrapper around `pd.get_dummies`; `drop_first` is forwarded unchanged.
    The input frame is not modified.
    """
    return pd.get_dummies(dataframe,
                          columns=categorical_cols,
                          drop_first=drop_first)

### Label Encoding

In [None]:
for col in binary_cols:
    label_encoder(df, col)

### One-Hot Encoding (with two step)

In [None]:
# These columns include same information that are no internet service and no phone service
dummy_diff = ['OnlineSecurity', 'OnlineBackup', 
              'DeviceProtection', 'TechSupport', 
              'StreamingTV', 'StreamingMovies', 
              'MultipleLines']

In [None]:
df = one_hot_encoder(df, dummy_diff, drop_first=False)

In [None]:
df.head()

**Now I'm creating a drop list to remove the duplicated dummy features.**

In [None]:
drop_list = df.columns[df.columns.str.contains(pat = 'No internet service')].to_list()
drop_list.append('MultipleLines_No phone service')
drop_list

In [None]:
df.drop(drop_list, axis=1, inplace=True)

**Now I'll convert the remaining categorical features to dummy variables, this time with the drop_first argument set to True.**

In [None]:
# Difference list
# Multi-class columns that were not already encoded through dummy_diff.
# A list comprehension keeps the original column order; a set difference would
# return the names in an arbitrary order that can change between kernel runs,
# which in turn changes the order of the encoded feature columns.
dummy = [col for col in multi_col if col not in dummy_diff]
dummy

In [None]:
df = one_hot_encoder(df, dummy, drop_first=True)

In [None]:
df.head()

# Split Data

In [None]:
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, random_state=101)

# Scaling

**The numeric features are standardized after the train/test split, so that statistics of the test data do not leak into the scaling of the training data.**

In [None]:
# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on X_test would put the two splits on different scales
# and would use test-set statistics during preprocessing.
scaler = StandardScaler().fit(X_train[num_col])
X_train[num_col] = scaler.transform(X_train[num_col])
X_test[num_col] = scaler.transform(X_test[num_col])

In [None]:
X_train.shape, X_test.shape

# Model

### But First

**I'm checking to balance of target data**

In [None]:
print(' Churn No Ratio: ', round(df['Churn'].value_counts()[0]/df.shape[0] ,2), '\n', 
      'Churn Yes Ratio: ', round(df['Churn'].value_counts()[1]/df.shape[0] ,2))

**We can say that the distribution of Churn is imbalanced, but first let's have a look at the models.**

### Unvalidated raw model results

In [None]:
# Candidate models, all with (mostly) default hyper-parameters.
models = [('LR', LogisticRegression(solver='liblinear')),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier()),
          ('SVC', SVC(gamma='auto')),
          ('GB',GradientBoostingClassifier()),
          ("LightGBM", LGBMClassifier())]

results = []
names = []

# random_state only has an effect when shuffle=True; recent scikit-learn versions
# raise a ValueError if random_state is set while shuffle is False.
# The splitter is built once so every model is scored on the same 10 folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

**Yes, generally we can say good according to accuracy but we need to see other model success metrics**

## Logistic Regression

In [None]:
loj = LogisticRegression(solver='liblinear')
loj_model = loj.fit(X_train, y_train)
y_pred = loj_model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

**Precision and recall for the churn class (1) are quite low**

In [None]:
LR = [0.81, 0.84, 0.91, 0.88, 0.70, 0.53, 0.61]

In [None]:
# cross_val_score refits clones of the estimator on the folds of the data it is given,
# so it is run on the training split; running it on X_test would train on held-out data.
cross_val_score(loj_model, X_train, y_train, cv=10).mean()

In [None]:
# The AUC has to be computed from the predicted probabilities (the same scores the
# ROC curve is drawn from); hard 0/1 predictions give only a single operating point.
loj_proba = loj_model.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, loj_proba)
fpr, tpr, thresholds = roc_curve(y_test, loj_proba)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.1])
plt.xlabel('False Positive Ratio')
plt.ylabel('True Positive Ratio')
plt.title('ROC_Logistic')
plt.legend(loc='lower right')  # without a legend the AUC label is never displayed
plt.show();

## GB (GRADIENT BOOSTING)

In [None]:
gbm_params = {"learning_rate" : [0.001, 0.01, 0.1, 0.05],
             "n_estimators": [100,500,1000],
             "max_depth": [3,5,10],
             "min_samples_split": [2,5,10]}

gbm = GradientBoostingClassifier()

gbm_cv = GridSearchCV(gbm, gbm_params, cv = 10, n_jobs = -1, verbose = 2)

In [None]:
gbm_cv.fit(X_train, y_train)

In [None]:
print('Best scor : ' + str(gbm_cv.best_score_))
print('Best parameters : ' + str(gbm_cv.best_params_))

In [None]:
gbm_tuned = GradientBoostingClassifier(learning_rate=0.05,
                                      max_depth=5,
                                      min_samples_split=10,
                                      n_estimators=100)
gbm_tuned.fit(X_train, y_train)

In [None]:
y_pred = gbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred) # Model Tuned

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
GB = [0.80, 0.83, 0.91, 0.87, 0.69, 0.51, 0.59]

In [None]:
# The AUC has to be computed from the predicted probabilities (the same scores the
# ROC curve is drawn from); hard 0/1 predictions give only a single operating point.
gbm_proba = gbm_tuned.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, gbm_proba)
fpr, tpr, thresholds = roc_curve(y_test, gbm_proba)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.1])
plt.xlabel('False Positive Ratio')
plt.ylabel('True Positive Ratio')
plt.title('ROC_GB')
plt.legend(loc='lower right')  # without a legend the AUC label is never displayed
plt.show();

In [None]:
conclusion_1 = pd.DataFrame({'LR': LR,
                             'GB': GB}, index=['accuracy', 'precision_0', 
                                               'recall_0', 'f_1_score_0',
                                               'precision_1', 'recall_1', 
                                               'f_1_score_1']).T

**When we only look at the accuracy values, a success rate of 81% looks quite good. However, the rate of correctly predicted churners is quite low, as the f1_score and the other metrics show. This indicates that the data set is imbalanced with respect to the target variable.**

**What needs to be done to correct this imbalance?**

* **Adding more churned customers to the data**

* **Building the model on a sample of non-churned customers whose size is close to the number of churned customers**

* **Balancing the classes by assigning class weights before fitting the model**

# Model With Sample

In [None]:
df = df_.copy()

**I choose a random sample from non-churn data.**

In [None]:
df.drop('customerID', axis=1, inplace=True)
df_1 = df[df['Churn'] == 'No'].sample(n=2000, random_state=1)
df_2 = df[df['Churn'] == 'Yes']
df = pd.concat([df_1, df_2]).reset_index(drop=True)

In [None]:
df['Churn'].value_counts()

In [None]:
df['SeniorCitizen'] = df['SeniorCitizen'].astype('O')
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
null_values = df[df['TotalCharges'].isnull()].index.to_list()
df.loc[df.index.isin(null_values), 'TotalCharges'] = df.loc[df.index\
                                                            .isin(null_values),
                                                            'MonthlyCharges']

In [None]:
cat_col = [col for col in df.columns if df[col].dtypes == 'O']
num_col = [col for col in df.columns if df[col].dtypes != 'O']

In [None]:
cat_col

In [None]:
num_col

## Feature Engineering

In [None]:
# pd.cut builds right-closed bins, so without include_lowest=True the first bin is
# (0, 0.5] and customers with tenure == 0 would get NaN instead of 'New'.
df['new_tenure_segment'] = pd.cut(df['tenure'], bins=[0, 0.5, 30, 60, 100],
                                  labels=['New', 'Low', 'Medium', 'High'],
                                  include_lowest=True).astype('O')
df['new_MonhlyCharges_segment'] = pd.cut(df['MonthlyCharges'], bins=[0, 70, 90, 120],
                                 labels=['Low', 'Medium', 'High']).astype('O')

In [None]:
scaler = MinMaxScaler(feature_range=(1, 10))
scaler.fit(df[['tenure', 'MonthlyCharges']])
df[['tenure_scaled', 'MonthlyCharges_scaled']] = scaler.transform(df[['tenure', 'MonthlyCharges']])
df['segment_score_1'] = (0.65 * df['tenure_scaled'] + 0.35 * df['MonthlyCharges_scaled'])
df['segment_1'] = pd.qcut(df['segment_score_1'], 5, labels=['E', 'D', 'C', 'B', 'A']).astype('O')
df.drop(['tenure_scaled', 'segment_score_1', 'MonthlyCharges_scaled',
        'tenure', 'MonthlyCharges'], axis=1, inplace=True)

In [None]:
binary_cols = [col for col in df.columns if df[col].nunique() == 2]
multi_col = [col for col in df.columns if df[col].nunique() > 2 and df[col].dtypes == 'O']
num_col = [col for col in df.columns if df[col].dtypes in ['float64', 'int64']]

In [None]:
for col in binary_cols:
    label_encoder(df, col)

In [None]:
# Step 1: encode the service columns without dropping a level, then remove the
# redundant "No internet service" / "No phone service" dummies by name.
df = one_hot_encoder(df, dummy_diff, drop_first=False)
drop_list = df.columns[df.columns.str.contains(pat = 'No internet service')].to_list()
drop_list.append('MultipleLines_No phone service')
df.drop(drop_list, axis=1, inplace=True)
# Step 2: rebuild the list of remaining multi-class columns from this section's
# multi_col instead of reusing the list left over from the first model, and keep
# the column order deterministic.
dummy = [col for col in multi_col if col not in dummy_diff]
df = one_hot_encoder(df, dummy, drop_first=True)

In [None]:
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [None]:
num_col = ['TotalCharges']

In [None]:
# Fit the scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on X_test would put the two splits on different scales
# and would use test-set statistics during preprocessing.
scaler = StandardScaler().fit(X_train[num_col])
X_train[num_col] = scaler.transform(X_train[num_col])
X_test[num_col] = scaler.transform(X_test[num_col])

## Logistic Regression

In [None]:
loj = LogisticRegression(solver='liblinear')
loj_model = loj.fit(X_train, y_train)
y_pred = loj_model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
LR = [0.77, 0.81, 0.74, 0.77, 0.74, 0.81, 0.77]

In [None]:
# cross_val_score refits clones of the estimator on the folds of the data it is given,
# so it is run on the training split; running it on X_test would train on held-out data.
cross_val_score(loj_model, X_train, y_train, cv=10).mean()

In [None]:
# The AUC has to be computed from the predicted probabilities (the same scores the
# ROC curve is drawn from); hard 0/1 predictions give only a single operating point.
loj_proba = loj_model.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, loj_proba)
fpr, tpr, thresholds = roc_curve(y_test, loj_proba)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.1])
plt.xlabel('False Positive Ratio')
plt.ylabel('True Positive Ratio')
plt.title('ROC_Logistic')
plt.legend(loc='lower right')  # without a legend the AUC label is never displayed
plt.show();

# GB (Gradient Boosting)

In [None]:
gbm_params = {"learning_rate" : [0.001, 0.01, 0.1, 0.05],
             "n_estimators": [100, 500, 1000],
             "max_depth": [3,5,10],
             "min_samples_split": [2,5,10]}

gbm = GradientBoostingClassifier()

gbm_cv = GridSearchCV(gbm, gbm_params, cv = 10, n_jobs = -1, verbose = 2)

In [None]:
gbm_cv.fit(X_train, y_train)

In [None]:
print('Best score : ' + str(gbm_cv.best_score_))
print('Best parameters : ' + str(gbm_cv.best_params_))

In [None]:
gbm_tuned = GradientBoostingClassifier(learning_rate=0.01,
                                      max_depth=5,
                                      min_samples_split=10,
                                      n_estimators=500)
gbm_tuned.fit(X_train, y_train)

In [None]:
y_pred = gbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred) # Model Tuned

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
GB = [0.74, 0.75, 0.75, 0.75, 0.74, 0.74, 0.74]

In [None]:
# The AUC has to be computed from the predicted probabilities (the same scores the
# ROC curve is drawn from); hard 0/1 predictions give only a single operating point.
gbm_proba = gbm_tuned.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, gbm_proba)
fpr, tpr, thresholds = roc_curve(y_test, gbm_proba)
plt.figure()
plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.1])
plt.xlabel('False Positive Ratio')
plt.ylabel('True Positive Ratio')
plt.title('ROC_GB')
plt.legend(loc='lower right')  # without a legend the AUC label is never displayed
plt.show();

In [None]:
conclusion_2 = pd.DataFrame({'LR': LR,
                        'GB': GB,}, index=['accuracy', 'precision_0', 
                                        'recall_0', 'f_1_score_0',
                                        'precision_1', 'recall_1', 
                                        'f_1_score_1']).T

In [None]:
conclusion_1

In [None]:
conclusion_2

**Yes, accuracy has decreased, but the recall, f1_score and precision of the churn class have increased considerably. These metrics are much more important to us, as our goal is churn prediction.**