In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(file_path)
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Churn Prediction Assignment

This notebook covers, step by step, the creation of the ranking algorithm used to solve Conrad's technical task.

# General Settings

## Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
import seaborn
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score

## Support Methods

In [None]:
def print_info_about_data_frame(df):
    """
    Print a quick overview of a DataFrame: type, shape, dtypes, head and describe().
    :param df: DataFrame. Data to have its stats printed.
    """
    # Each entry: (title, callable producing the value, whether a blank separator follows).
    # Values are produced lazily so sections are printed in order, one at a time.
    sections = (
        ("DataFrame type:", lambda: type(df), False),
        ("DataFrame shape:", lambda: df.shape, False),
        ("DataFrame dtypes:", lambda: df.dtypes, True),
        ("DataFrame head:", df.head, True),
        ("DataFrame description:", df.describe, True),
    )
    for title, produce, add_separator in sections:
        print(title)
        print(str(produce()))
        if add_separator:
            print("\n")


def get_nan_stats(df, fraction = False):
    """
    Prints the DataFrame shape, the total number of NaN values and the NaN count per attribute.
    :param df: DataFrame.
    :param fraction: bool. If True, a table is printed that also shows, per attribute, the NaN count divided by
    the number of rows (rounded to 2 decimals), plus a TOTAL row summing both columns.
    """
    nan_per_attr = df.isna().sum()
    total_nan = nan_per_attr.sum()

    print("DataFrame shape:", df.shape)
    print("Total number of NaN values:", total_nan)
    print("NaN values per Attribute:", "\n")

    if not fraction:
        print(nan_per_attr)
        return

    nan_share = (nan_per_attr / df.shape[0]).round(2)
    table_rows = list(zip(df.columns.tolist(), nan_per_attr.tolist(), nan_share.tolist()))
    # The TOTAL fraction is the sum of the rounded per-attribute fractions.
    table_rows.append(("TOTAL", total_nan, nan_share.sum()))
    print(tabulate(table_rows, headers=["Attribute", "NaN values", "Fraction"]))


def print_single_attr_stats_without_plot(df, attr_name, min_unique = 20):
    """
    Print individual stats of one df attribute (no plot is produced).
    :param df: DataFrame. Data to have its attribute stats printed.
    :param attr_name: str. Name of the attribute to be analyzed.
    :param min_unique: int. The per-value counts are printed only when the attribute has fewer than this many
    unique values.
    """
    column = df[attr_name]
    # value_counts() skips NaN, so its length is the number of distinct non-null values.
    value_counts = column.value_counts()
    n_unique = len(value_counts)
    n_rows = df.shape[0]
    # An empty frame has no meaningful ratio; report NaN instead of raising ZeroDivisionError.
    unique_fraction = n_unique / n_rows if n_rows else float("nan")

    print(f"Attribute Name: {attr_name}")
    print(f" Attribute type: {column.dtype}")
    # Count nulls on the Series directly; positional `[0]` on a label-indexed Series is deprecated in pandas.
    print(f" Number of Null values: {column.isnull().sum()}")
    print(f" Number of unique values is:{n_unique}")
    print(f" Percentage of unique values is: {unique_fraction}")
    if n_unique < min_unique:
        print("\n")
        print("Summation of unique values per ID:")
        print(value_counts)
    print("\n")

def print_attr_stats(df, min_unique = 20):
    """
    Prints the stats of every column of df, one column after another.
    :param df: DataFrame. Data to have its column stats printed.
    :param min_unique: int. Forwarded unchanged to print_single_attr_stats_without_plot for every column.
    """
    separator = "#############################################"
    for column_name in df.columns.tolist():
        print_single_attr_stats_without_plot(df, column_name, min_unique)
        # Visual divider between consecutive attributes.
        print(separator)
        print("\n")
        
def get_memory_usage(df, attr_name, list_dtypes, deep = True):
    """
    Prints the memory usage of one DataFrame attribute after casting it to each of the given dtypes.
    The DataFrame itself is not modified. A cast that fails is reported by printing the exception.
    :param df: DataFrame.
    :param attr_name: str. Name of the attribute to be measured.
    :param list_dtypes: List[str]. Dtypes to cast the attribute to, measured one by one.
    :param deep: bool. Forwarded to Series.memory_usage(deep=...).
    """
    print(f"Memory usage for attribute: {attr_name}")
    for candidate in list_dtypes:
        print(f"  Attribute Name: {attr_name}")
        print(f"  Measured dtype: {candidate}")
        try:
            usage = df[attr_name].astype(candidate).memory_usage(deep=deep)
        # pylint: disable=broad-except
        except Exception as error:
            # Best effort: report the failure and carry on with the next dtype.
            print(error)
        # pylint: enable=broad-except
        else:
            print("  Memory Usage:", usage)
        # Blank separator between measurements, skipped for entries equal to the last dtype.
        if candidate != list_dtypes[-1]:
            print("\n")
    print("\n")
    
def change_attribute_dtype(df, attr_name, dtype_name):
    """
    Casts one attribute of the data frame to another dtype, modifying the data frame in place.
    :param df: DataFrame. Modified in place; nothing is returned.
    :param attr_name: str. Name of the attribute to cast.
    :param dtype_name: str. Target dtype, passed to Series.astype.
    """
    converted = df[attr_name].astype(dtype_name)
    df[attr_name] = converted
    
def plot_bar_chart(df, attr_name, return_trace=False):
    """
    Builds a bar chart with the number of rows per distinct value of an attribute.
    :param df: DataFrame. Not modified; only a copy of the attribute column is used.
    :param attr_name: str. Name of the attribute to be plotted.
    :param return_trace: bool. Default is False. If True, the trace is returned instead of plotted. Used for
    subplots.
    :return: go.Bar when return_trace is True, otherwise None.
    """
    counts = df[[attr_name]].copy()
    # Attach the group size to every row, then keep one row per distinct value.
    counts["UNIQUE_COUNT"] = counts.groupby([attr_name])[attr_name].transform("count")
    counts = counts.drop_duplicates(subset=attr_name).reset_index(drop=True)
    bar = go.Bar(x=counts[attr_name], y=counts["UNIQUE_COUNT"], name=attr_name)

    if return_trace:
        return bar

    figure_layout = dict(
        xaxis_title=attr_name,
        yaxis_title="UNIQUE_COUNT",
        barmode="group",
        xaxis=dict(categoryorder="total ascending"),
    )
    py.iplot(go.Figure(data=bar, layout=figure_layout))
    
def plot_histogram(df, attr_name, name=None, show_legend=True, return_trace=False):
    """
    Builds a histogram of a given attribute.
    :param df: DataFrame. Not modified; only a copy of the attribute column is used.
    :param attr_name: str. Attribute whose values are binned.
    :param name: str. Name of the trace. Falls back to attr_name when None.
    :param show_legend: bool. True to show legend of the trace.
    :param return_trace: bool. Default is False. If True, the trace is returned instead of plotted. Used for
    subplots.
    :return: go.Histogram when return_trace is True, otherwise None.
    """
    values = df[[attr_name]].copy()[attr_name]
    trace_name = attr_name if name is None else name
    # NOTE(review): nbinsx=0 presumably leaves the bin count to plotly's automatic choice - confirm.
    histogram = go.Histogram(
        x=values,
        nbinsx=0,
        marker_line_color="black",
        marker_line_width=1,
        name=trace_name,
        showlegend=show_legend,
    )
    if return_trace:
        return histogram
    py.iplot(go.Figure(data=histogram, layout=dict(xaxis_title=attr_name)))
    
def plot_stack_bar_chart(df, common_attr, attr_name, show_legend=True, return_trace=False):
    """
    Builds a stacked bar chart of one attribute broken down by another.
    One bar trace is created per distinct value of attr_name; its heights are the row counts of that value for
    each value of common_attr.
    :param df: DataFrame. Not modified.
    :param common_attr: str. X axis attribute.
    :param attr_name: str. Attribute whose values become the stacked traces.
    :param show_legend: bool. True to show legend.
    :param return_trace: bool. Default is False. If True, the traces are returned instead of plotted. Used for
    subplots.
    :return: List[go.Bar] when return_trace is True, otherwise None.
    """
    # Rows: values of common_attr. Columns: values of attr_name. Cells: number of rows with that combination.
    counts = df[[common_attr, attr_name]].groupby([common_attr, attr_name]).size().unstack()
    traces = [
        go.Bar(x=counts.index, y=counts[col_name], name=col_name, showlegend=show_legend,
               hovertemplate=" %{value}, " + attr_name)
        for col_name in counts.columns.tolist()
    ]
    if return_trace:
        return traces

    # Plot every trace stacked. The previous code referenced an undefined `layout` here and kept only the
    # last trace, so this branch always failed.
    fig = go.Figure(data=traces)
    fig.update_layout(barmode='stack', xaxis_title=common_attr)
    py.iplot(fig)
    
def multiple_bar_charts(df, rows, cols, attr_list, height, width, title, subplot_titles=False, y_range=(0, 7000)):
    """
    Plots multiple bar plots with plotly subplots, filling the grid row by row.
    Only the first rows * cols attributes are plotted; if fewer attributes are given, the remaining cells stay
    empty.
    :param df: DataFrame.
    :param rows: int. Number of subplot rows.
    :param cols: int. Number of subplot columns.
    :param attr_list: List[str]. List of attributes to plot.
    :param height: int. Figure height.
    :param width: int. Figure width.
    :param title: str. Figure title.
    :param subplot_titles: bool. Default is False. If True, the plotted attributes are used as subplot titles and
    the y axes are shared.
    :param y_range: Tuple[int, int] or None. Range applied to every y axis. None leaves the range to plotly.
    """
    plotted_attrs = list(attr_list)[:rows * cols]

    if subplot_titles:
        # Titles come from the attr_list argument, not from a notebook-level global.
        fig = make_subplots(rows=rows, cols=cols, subplot_titles=plotted_attrs, shared_yaxes=True)
    else:
        fig = make_subplots(rows=rows, cols=cols)

    for position, attr in enumerate(plotted_attrs):
        # Convert the flat position into 1-based (row, col) grid coordinates.
        row, col = divmod(position, cols)
        row, col = row + 1, col + 1
        fig.add_trace(plot_bar_chart(df, attr, True), row=row, col=col)
        fig.update_xaxes(title_text=attr, row=row, col=col)
        if y_range is not None:
            fig.update_yaxes(range=list(y_range), row=row, col=col)

    fig.update_layout(height=height, width=width, title_text=title)
    fig.show()

## Constants

In [None]:
# Candidate dtypes compared when measuring the memory usage of a string attribute.
dtypes_list = ["object", "category"]

# Yes/No <-> 1/0 lookups; the inverse is derived so both always stay in sync.
binary_dict = {"Yes": 1, "No": 0}
inverse_binary_dict = {flag: label for label, flag in binary_dict.items()}

# Target dtype of every attribute kept in the clean data set.
# NOTE(review): gender stays "object" here although the EDA notes describe switching it to category - confirm.
all_dtypes = {
    "gender": "object",
    "SeniorCitizen": "int64",
    "Partner": "category",
    "Dependents": "category",
    "tenure": "int64",
    "PhoneService": "category",
    "MultipleLines": "category",
    "InternetService": "category",
    "OnlineSecurity": "category",
    "OnlineBackup": "category",
    "DeviceProtection": "category",
    "TechSupport": "category",
    "StreamingTV": "category",
    "StreamingMovies": "category",
    "Contract": "category",
    "PaperlessBilling": "category",
    "PaymentMethod": "category",
    "MonthlyCharges": "float64",
    "TotalCharges": "float64",
    "Churn": "category",
}

# Data Exploration & Preprocessing

## Load data

In [None]:
# Load the Telco customer-churn CSV from the attached Kaggle dataset.
df_churn = pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

## In Depth Raw Data EDA

In [None]:
# Explore on a copy so the per-attribute dtype changes below do not touch df_churn.
df_customer_churn = df_churn.copy()

### Dataset WA_Fn-UseC_-Telco-Customer-Churn

* No NaN values are found in the dataset.
* The size of the data frame is (7043, 21).

In [None]:
# Overview of the working copy: type, shape, dtypes, head and describe().
print_info_about_data_frame(df_customer_churn)

# NaN totals and per-attribute counts (counts only, since fraction keeps its default False).
get_nan_stats(df_customer_churn)

#### Attribute customerID

* No preprocessing is applied to this attribute.
* The current dtype (object) is the most efficient.
* There are 7043 unique customerID values. In other words, there is a unique entry in the dataset for each customerID.
* This attribute will be dropped since it does not carry any information useful for modelling.

In [None]:
attr_name = "customerID"
# Dtype, null count and uniqueness of the attribute.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list; the frame itself is not changed.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

#### Attribute gender

* Attribute is binary (Male/Female).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "gender"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place, then re-print the stats to confirm the new dtype.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute SeniorCitizen

* Attribute is binary (1/0). For EDA purposes, the values will be changed to Yes/No, however for modeling they will remain the same.
* No preprocessing is applied to this attribute.

In [None]:
attr_name = "SeniorCitizen"
# Dtype, null count and uniqueness of the attribute.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value (values are left as 1/0 here).
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute Partner

* Attribute is binary (Yes/No).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "Partner"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute Dependents

* Attribute is binary (Yes/No).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "Dependents"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute tenure
* No preprocessing is applied to this attribute.

In [None]:
attr_name = "tenure"
# Dtype, null count and uniqueness of the attribute.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Histogram of the attribute values.
plot_histogram(df_customer_churn, attr_name)

#### Attribute PhoneService

* Attribute is binary (Yes/No).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "PhoneService"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# plot_bar_chart copies the column it needs, so the frame is passed directly (no extra full-frame copy).
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute MultipleLines

* Attribute has three categories (Yes/No/No phone service).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "MultipleLines"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# plot_bar_chart copies the column it needs, so the frame is passed directly (no extra full-frame copy).
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute InternetService

* Attribute has three categories (Fiber optic/DSL/No).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "InternetService"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# plot_bar_chart copies the column it needs, so the frame is passed directly (no extra full-frame copy).
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute OnlineSecurity

* Attribute has three categories (No/Yes/No internet service).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "OnlineSecurity"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# plot_bar_chart copies the column it needs, so the frame is passed directly (no extra full-frame copy).
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute OnlineBackup

* Attribute has three categories (No/Yes/No internet service).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "OnlineBackup"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# plot_bar_chart copies the column it needs, so the frame is passed directly (no extra full-frame copy).
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute DeviceProtection

* Attribute has three categories (No/Yes/No internet service).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "DeviceProtection"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# plot_bar_chart copies the column it needs, so the frame is passed directly (no extra full-frame copy).
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute TechSupport

* Attribute has three categories (No/Yes/No internet service).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "TechSupport"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# plot_bar_chart copies the column it needs, so the frame is passed directly (no extra full-frame copy).
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute StreamingTV

* Attribute has three categories (No/Yes/No internet service).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "StreamingTV"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute StreamingMovies

* Attribute has three categories (No/Yes/No internet service).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "StreamingMovies"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute Contract

* Attribute has three categories (Month-to-month/Two year/One year).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "Contract"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute PaperlessBilling

* Attribute is binary (Yes/No).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "PaperlessBilling"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute PaymentMethod

* Attribute has four categories (Electronic check/Mailed check/Bank transfer (automatic)/Credit card (automatic)).
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "PaymentMethod"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

#### Attribute MonthlyCharges

* No preprocessing is applied to this attribute.

In [None]:
attr_name = "MonthlyCharges"
# Dtype, null count and uniqueness of the attribute.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Histogram of the attribute values.
plot_histogram(df_customer_churn, attr_name)

#### Attribute TotalCharges

* Almost all elements are strings of float numbers. 
* There are 11 strings that are an empty character (" "). These rows are dropped.
* Dtype is changed to float64.

In [None]:
attr_name = "TotalCharges"
# Stats with the dtype as loaded (before the numeric conversion below).
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# errors="coerce" turns values that cannot be parsed as numbers into NaN; downcast="float" asks pandas
# for the smallest float dtype it considers suitable.
df_customer_churn[attr_name] = pd.to_numeric(df_customer_churn[attr_name], errors="coerce", downcast="float")  # Change string values to float numbers
# New frame without any row that contains a NaN; df_customer_churn itself keeps those rows.
df_customer_churn_drop_na = df_customer_churn.dropna() 

# Stats after conversion, computed on the NaN-free frame.
print_single_attr_stats_without_plot(df_customer_churn_drop_na, attr_name)

# Histogram of the converted values.
plot_histogram(df_customer_churn_drop_na, attr_name)

* The dropped rows are the following:

In [None]:
# Keep only the rows that dropna() removed: drop every index label that survived in the NaN-free frame.
df_customer_churn_only_nan = df_customer_churn.drop(df_customer_churn_drop_na.index)
# Show up to 11 rows (the number of dropped rows reported in the notes above).
df_customer_churn_only_nan.head(11)

#### Attribute Churn

* Attribute is binary (No/Yes)
* Dtype is changed to a much more efficient category dtype.

In [None]:
attr_name = "Churn"
# Stats with the dtype as loaded.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Memory footprint for each candidate dtype in dtypes_list.
get_memory_usage(df_customer_churn, attr_name, dtypes_list)

# Cast to dtypes_list[1] ("category") in place.
change_attribute_dtype(df_customer_churn, attr_name, dtypes_list[1])

# Re-print the stats to confirm the new dtype.
print_single_attr_stats_without_plot(df_customer_churn, attr_name)

# Bar chart of the number of rows per value.
plot_bar_chart(df_customer_churn, attr_name)

### Final Preprocessing - Clean data

* Based on the preprocessing, dtypes are changed and rows including empty strings are dropped.
* The attribute customerID is dropped since it does not carry any necessary information for modelling.
* The final shape of the preprocessed data is (7032, 20).
* All attribute names are made upper case.

_Note: Encoding, although technically part of Preprocessing has been moved to model selection and training section._

In [None]:
# NOTE: this cell rebinds df_churn, so it can only be run once per load of the data.

# TotalCharges is parsed as float64; empty strings (" ") become NaN and those rows are dropped.
# No downcast is requested: the column ends up as float64 anyway (see all_dtypes), so going through
# float32 first would only lose precision.
total_charges = pd.to_numeric(df_churn["TotalCharges"], errors="coerce")
df_churn = df_churn.assign(TotalCharges=total_charges).dropna()

# customerID is kept aside (aligned with the remaining rows) and dropped by name rather than by position.
customer_id = df_churn["customerID"].values
df_churn = df_churn.drop(columns="customerID")

# Every remaining attribute is cast to its final dtype in one step.
df_churn = df_churn.astype(all_dtypes)

# Column names are made upper case words.
df_churn.columns = df_churn.columns.str.upper()

print_info_about_data_frame(df_churn)

print_attr_stats(df_churn)

## Exploratory Data Analysis

* EDA of the cleaned data.
* Provides a look into the dataset and how its different attributes relate to each other.
* Divided into Customer, Hired Services, and Financial and Contractual attributes.
* Churn is analyzed individually.

### Customer Attributes

* Attributes gender, SeniorCitizen, Partner and Dependents are associated to the personal status of each customer.
* Partner and gender are almost 50% distributed, Males (3549 vs 3483) and No partner (3639 vs 3393) dominate respectively.
* On the other side, SeniorCitizen and Dependents show a significant difference. In the first, non-senior dominate (5890) and in the latter non-dependents (4933).

_Note: For readability reasons, the values in SeniorCitizen are changed to Yes (1) and No (0) strings._

In [None]:
customer_attrs = ["GENDER", "SENIORCITIZEN", "PARTNER", "DEPENDENTS", "CHURN"]  # Churn is kept for later use.
# Work on a copy so the relabelling below does not alter df_churn.
df_customer_attributes = df_churn[customer_attrs].copy()
# Relabel 1/0 as "Yes"/"No" for readability in the plots.
df_customer_attributes["SENIORCITIZEN"] = df_customer_attributes["SENIORCITIZEN"].map(inverse_binary_dict)
    
# A 1x4 grid plots the first four attributes only, so CHURN is not drawn here.
multiple_bar_charts(df_customer_attributes, 1, 4, customer_attrs, 400, 1000, "Customer Attributes", False)

* Gender and Partner are almost equally distributed with regard to SeniorCitizen.
* For non-senior citizens, 65.91% do not have dependents. For senior citizens the large majority also does not have dependents (92.03%).

In [None]:
sub_attr_list = ["GENDER", "PARTNER", "DEPENDENTS"]
attr = "SENIORCITIZEN"
# Legend entries are shown for the first two panels only, as in the original layout of this figure.
show_legend_flags = [True, True, False]
fig = make_subplots(rows=1, cols=3, subplot_titles=sub_attr_list)

# One panel per attribute. The traces of each panel are computed once and added in order.
for col, (sub_attr, show_legend) in enumerate(zip(sub_attr_list, show_legend_flags), start=1):
    for trace in plot_stack_bar_chart(df_customer_attributes, attr, sub_attr, show_legend, True):
        fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text=attr, row=1, col=col)

# Colours are assigned by substring match on the trace name, applied in this order (later entries win).
trace_colors = {"Yes": "#636EFA", "No": "#EF553B", "Male": "#AB63FA", "Female": "#00CC96"}
for label, color in trace_colors.items():
    fig.for_each_trace(
        lambda trace, label=label, color=color:
            trace.update(marker=dict(color=color)) if label in trace.name else ()
    )

fig.update_layout(height=400, width=1000, barmode="stack", title_text="SENIORCITIZEN Distribution")
fig.show()

* Gender does not play a significant role in determining whether the customer has dependents or not. Both genders are equally distributed (Males slightly more numerous than Females in both cases).
* Partner is significant with respect to customer dependency. For customers with no dependents, those without a partner are the largest (66.49%). However, when the customers do have dependents, those with a partner do dominate (82.9%).
* As we saw before, non-senior citizens are the majority both when there are dependents (92.03%) and when there are not (65.91%).

In [None]:
sub_attr_list = ["GENDER", "PARTNER", "SENIORCITIZEN"]
attr = "DEPENDENTS"
# Legend entries are shown for the first two panels only, as in the original layout of this figure.
show_legend_flags = [True, True, False]
fig = make_subplots(rows=1, cols=3, subplot_titles=sub_attr_list)

# One panel per attribute. The traces of each panel are computed once and added in order.
for col, (sub_attr, show_legend) in enumerate(zip(sub_attr_list, show_legend_flags), start=1):
    for trace in plot_stack_bar_chart(df_customer_attributes, attr, sub_attr, show_legend, True):
        fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text=attr, row=1, col=col)

# Colours are assigned by substring match on the trace name, applied in this order (later entries win).
trace_colors = {"Yes": "#636EFA", "No": "#EF553B", "Male": "#AB63FA", "Female": "#00CC96"}
for label, color in trace_colors.items():
    fig.for_each_trace(
        lambda trace, label=label, color=color:
            trace.update(marker=dict(color=color)) if label in trace.name else ()
    )

fig.update_layout(height=400, width=1000, barmode="stack", title_text="DEPENDENTS Distribution")
fig.show()

* Gender shows an almost equal distribution independently if the customers have or do not have a partner.
* As seen above, the majority of customers with partners have dependents (66.49%). This is in contrast with those clients without partners, since a large majority of them do not have dependents (82.9%).
* As we saw before, non-senior citizens are the majority both when there are dependents (92.03%) and when there are not (65.91%).

In [None]:
sub_attr_list = ["GENDER", "DEPENDENTS", "SENIORCITIZEN"]
attr = "PARTNER"
# Legend entries are shown for the first two panels only, as in the original layout of this figure.
show_legend_flags = [True, True, False]
fig = make_subplots(rows=1, cols=3, subplot_titles=sub_attr_list)

# One panel per attribute. The traces of each panel are computed once and added in order.
for col, (sub_attr, show_legend) in enumerate(zip(sub_attr_list, show_legend_flags), start=1):
    for trace in plot_stack_bar_chart(df_customer_attributes, attr, sub_attr, show_legend, True):
        fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text=attr, row=1, col=col)

# Colours are assigned by substring match on the trace name, applied in this order (later entries win).
trace_colors = {"Yes": "#636EFA", "No": "#EF553B", "Male": "#AB63FA", "Female": "#00CC96"}
for label, color in trace_colors.items():
    fig.for_each_trace(
        lambda trace, label=label, color=color:
            trace.update(marker=dict(color=color)) if label in trace.name else ()
    )

fig.update_layout(height=400, width=1000, barmode="stack", title_text="PARTNER Distribution")
fig.show()

### Hired Services Attributes

* PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies are considered hired services attributes.
* The large majority of clients do have a phone service contracted (6352 vs 680). Of those that do, a slight majority do not have multiple lines (3385 vs 2967).
* 1520 customers do not have internet service. Of those that have hired an internet service, the majority have fiber optic (3096). The rest have DSL (2416).
* Online security, device protection, online backup and tech support follow a similar distribution, in which the majority of consumers have not hired these services compared to those that have (~60% of clients with internet service).
* Regarding the available streaming services (TV and movies), clients with internet services do not have a preference (No slightly dominates with 50.96% and 50.45% respectively).

In [None]:
services_attrs = ["PHONESERVICE", "MULTIPLELINES", "INTERNETSERVICE", "ONLINESECURITY", "ONLINEBACKUP", "DEVICEPROTECTION", 
             "TECHSUPPORT", "STREAMINGTV", "STREAMINGMOVIES", "CHURN"]  # Churn is kept for later use.

# Independent copy of the service columns for the plots below.
df_services_attributes = df_churn[services_attrs].copy()

# A 3x3 grid plots the first nine attributes only, so CHURN is not drawn here.
multiple_bar_charts(df_services_attributes, 3, 3, services_attrs, 800, 1100, "Hired Services Attributes", False)

* Looking at how the internet services are distributed with regard to the phone service, surprisingly we find that all customers with fiber optic have also acquired phone service. This is in contrast with DSL, which is also hired by customers without phone service.

In [None]:
sub_attr_list = ["MULTIPLELINES", "INTERNETSERVICE"]
attr = "PHONESERVICE"

fig = make_subplots(rows=1, cols=2, subplot_titles=sub_attr_list)

# One panel per attribute. The traces of each panel are computed once and added in order.
for col, sub_attr in enumerate(sub_attr_list, start=1):
    panel_traces = plot_stack_bar_chart(df_services_attributes, attr, sub_attr, True, True)
    if sub_attr == "INTERNETSERVICE":
        # The third INTERNETSERVICE trace is kept out of the legend, as in the original layout
        # (presumably the "No" category, which the first panel already lists - confirm).
        panel_traces[2].update(showlegend=False)
    for trace in panel_traces:
        fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text=attr, row=1, col=col)

# Colours are assigned by substring match on the trace name, applied in this order, so the more specific
# "No phone service" entry overrides the generic "No" colour.
trace_colors = {
    "Yes": "#636EFA",
    "No": "#EF553B",
    "DSL": "#AB63FA",
    "Fiber optic": "#00CC96",
    "No phone service": "#FFA15A",
}
for label, color in trace_colors.items():
    fig.for_each_trace(
        lambda trace, label=label, color=color:
            trace.update(marker=dict(color=color)) if label in trace.name else ()
    )

fig.update_layout(height=400, width=1000, barmode="stack", title_text="PHONESERVICE Distribution")
fig.show()

* On average, ~53% of customers with DSL do not hire online security, tech support, online backup or device protection services.
* In contrast, clients with optic fiber have a clear preference for not adding to their contract online security and tech support (72.9% and 79.76% respectively). This is not the case however for online backup and device protection, in this case these customers do not have a clear preference in acquiring or not these services (~56% for both).

In [None]:
# Stacked bar charts of the four add-on services, broken down by INTERNETSERVICE.
sub_attr_list = ["ONLINESECURITY", "TECHSUPPORT","ONLINEBACKUP", "DEVICEPROTECTION"]
attr = "INTERNETSERVICE"

fig = make_subplots(rows=2, cols=2, subplot_titles=sub_attr_list)

# Build the traces once per subplot instead of re-running plot_stack_bar_chart
# for every single trace that is picked out of its result.
for position, sub_attr in enumerate(sub_attr_list):
    row_idx, col_idx = divmod(position, 2)
    row_idx, col_idx = row_idx + 1, col_idx + 1
    # The 4th positional argument is True only for the first subplot
    # (presumably the show-legend flag, so each category is listed once --
    # confirm against plot_stack_bar_chart).
    subplot_traces = plot_stack_bar_chart(df_services_attributes, attr, sub_attr, position == 0, True)
    for trace in subplot_traces[:3]:
        fig.add_trace(trace, row=row_idx, col=col_idx)
    fig.update_xaxes(title_text=attr, row=row_idx, col=col_idx)

# Colour rules are applied in order, so a later, more specific fragment
# ("No internet service") overrides an earlier, broader one ("No").
marker_colors = [
    ("Yes", "#636EFA"),
    ("No", "#EF553B"),
    ("No internet service", "#AB63FA"),
]
for name_fragment, marker_color in marker_colors:
    fig.for_each_trace(
        lambda trace, fragment=name_fragment, color=marker_color:
            trace.update(marker=dict(color=color)) if fragment in trace.name else ()
    )

fig.update_layout(height=600, width=1000, barmode="stack", title_text="INTERNETSERVICE Distribution")
fig.show()

* Customers with DSL prefer to not acquire tv or movies streaming services (60.55% and 59.44% respectively).
* Clients with optic fiber slightly prefer hiring streaming services (~56% for both types).

In [None]:
# Stacked bar charts of the two streaming services, broken down by INTERNETSERVICE.
sub_attr_list = ["STREAMINGTV", "STREAMINGMOVIES"]
attr = "INTERNETSERVICE"

fig = make_subplots(rows=1, cols=2, subplot_titles=sub_attr_list)

# One plot_stack_bar_chart call per subplot; its first three traces are added
# to the figure (the original re-ran the helper once per trace).
for col_idx, sub_attr in enumerate(sub_attr_list, start=1):
    # 4th positional argument is True only for the first subplot
    # (presumably the show-legend flag -- confirm against plot_stack_bar_chart).
    subplot_traces = plot_stack_bar_chart(df_services_attributes, attr, sub_attr, col_idx == 1, True)
    for trace in subplot_traces[:3]:
        fig.add_trace(trace, row=1, col=col_idx)
    fig.update_xaxes(title_text=attr, row=1, col=col_idx)

# Colour rules are applied in order, so the more specific
# "No internet service" overrides the broader "No".
marker_colors = [
    ("Yes", "#636EFA"),
    ("No", "#EF553B"),
    ("No internet service", "#AB63FA"),
]
for name_fragment, marker_color in marker_colors:
    fig.for_each_trace(
        lambda trace, fragment=name_fragment, color=marker_color:
            trace.update(marker=dict(color=color)) if fragment in trace.name else ()
    )

fig.update_layout(height=400, width=1000, barmode="stack", title_text="INTERNETSERVICE Distribution")
fig.show()

### Financial & Contractual Attributes

* Tenure, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges and TotalCharges are considered financial & contractual attributes.
* 55.1% of clients prefer a month-to-month contract. One-year and two-year contracts (20.93% and 23.96%) are almost equally preferred among the rest of the clients.
* A majority of customers (59.27%) prefer paperless billing.
* Of the four available payment methods, electronic check dominates with ~1/3 of customers. The other ~2/3 are split almost equally between the remaining three, with mailed check (22.81% of all customers) slightly larger than the other two (automatic bank transfer and credit card, both between 21-22%).
* Tenure's distribution shows larger concentrations of clients at short tenures (0-15) and long tenures (65-75).
* There is an initial peak in the monthly charges distribution: a number of clients are charged below 25.99 per month. The rest of the clients seem to be normally distributed around ~75.
* Total charges per customer show an exponential behaviour.

In [None]:
# Overview of the financial & contractual attributes: bar charts for the three
# categorical ones (top row) and histograms for the three numeric ones (bottom row).
# CHURN is kept in the frame because later cells group these attributes by churn.
f_c_attrs = ["CONTRACT", "PAPERLESSBILLING", "PAYMENTMETHOD", "TENURE", "MONTHLYCHARGES", "TOTALCHARGES", "CHURN"]

df_f_c_attributes = df_churn[f_c_attrs].copy()

fig = make_subplots(rows=2, cols=3, subplot_titles=f_c_attrs)

# Row 1: categorical attributes.
for col_idx, categorical_attr in enumerate(f_c_attrs[:3], start=1):
    bar_trace = plot_bar_chart(df_f_c_attributes, categorical_attr, return_trace=True)
    fig.add_trace(bar_trace, row=1, col=col_idx)

# Row 2: numeric attributes.
for col_idx, numeric_attr in enumerate(f_c_attrs[3:6], start=1):
    histogram_trace = plot_histogram(df_f_c_attributes, numeric_attr, return_trace=True)
    fig.add_trace(histogram_trace, row=2, col=col_idx)

# A shared y-range per row keeps the subplots within a row comparable.
for row_idx, y_max in ((1, 4500), (2, 1250)):
    for col_idx in (1, 2, 3):
        fig.update_yaxes(range=[0, y_max], row=row_idx, col=col_idx)

fig.update_layout(height=600, width=1000, barmode="stack", title_text="Finantial & Contractual Distributions")
fig.show()

* Month-to-month contract type dominates short tenures, following an exponential distribution which fades out at tenure values 72-73. 
* One year contracts seem to have a normal tenure distribution (centered around 40 and negatively skewed). 
* Two year contracts show an exponential tenure growth trend, with the largest number of contracts located at the largest tenures.
* Regarding monthly charges, the already mentioned peak seems to be equally distributed between all three contracts. However, the rest of the charges are mainly comprised by month-to-month contracts.
* The exponential distribution of the total charges is created by the month-to-month contracts. 
* Starting at 8000 total charges we only find the two-year contract. This contract seems to have two normal distributions, one centered around 1000 and the other around 6000 total charges.
* One year contracts are also exponentially distributed, however their distribution of total charges is not as strong as month-to-month contracts.

In [None]:
# Histograms of TENURE / MONTHLYCHARGES / TOTALCHARGES, stacked by contract type.
attr = "CONTRACT"
types_attr = ["Month-to-month", "One year", "Two year"]

# One sub-frame per contract type, in the same order as types_attr.
df_contract_mtm, df_contract_oy, df_contract_ty = (
    df_f_c_attributes.loc[df_f_c_attributes[attr] == contract_type].copy()
    for contract_type in types_attr
)
contract_frames = [df_contract_mtm, df_contract_oy, df_contract_ty]

sub_attr_list = ["TENURE", "MONTHLYCHARGES", "TOTALCHARGES"]

fig = make_subplots(rows=1, cols=3, subplot_titles=sub_attr_list)

for col_idx, sub_attr in enumerate(sub_attr_list, start=1):
    for contract_frame, contract_type in zip(contract_frames, types_attr):
        if col_idx == 1:
            # First subplot relies on plot_histogram's default for its 4th
            # parameter (presumably the show-legend flag -- confirm).
            histogram_trace = plot_histogram(contract_frame, sub_attr, contract_type, return_trace=True)
        else:
            histogram_trace = plot_histogram(contract_frame, sub_attr, contract_type, False, True)
        fig.add_trace(histogram_trace, row=1, col=col_idx)
    fig.update_yaxes(range=[0, 1250], row=1, col=col_idx)

# Same colour for a given contract type in every subplot.
contract_colors = zip(types_attr, ["#636EFA", "#EF553B", "#00CC96"])
for contract_type, marker_color in contract_colors:
    fig.for_each_trace(
        lambda trace, wanted=contract_type, color=marker_color:
            trace.update(marker=dict(color=color)) if trace.name == wanted else ()
    )

fig.update_layout(
    height=400,
    width=1000,
    barmode="stack",
    title="CONTRACT Distribution",
    legend_title_text="Contract type",
)
fig.show()

* Paperless billing dominates the tenure distribution over non-paperless billing.
* For monthly charges under 25.99 the non-paperless billing dominates. After that, paperless billing dominates.
* At lower values of the total charges distribution (< 2000) both types of billing contribute almost equally; however, as the total charges increase, non-paperless billing is reduced until it completely disappears at > 8000 total charges.

In [None]:
# Histograms of TENURE / MONTHLYCHARGES / TOTALCHARGES, stacked by paperless billing.
attr = "PAPERLESSBILLING"
types_attr = ["Yes", "No"]

# One sub-frame per billing type, in the same order as types_attr.
df_contract_pb_y, df_contract_pb_n = (
    df_f_c_attributes.loc[df_f_c_attributes[attr] == billing_type].copy()
    for billing_type in types_attr
)
billing_frames = [df_contract_pb_y, df_contract_pb_n]

sub_attr_list = ["TENURE", "MONTHLYCHARGES", "TOTALCHARGES"]

fig = make_subplots(rows=1, cols=3, subplot_titles=sub_attr_list)

for col_idx, sub_attr in enumerate(sub_attr_list, start=1):
    # 4th positional argument is True only in the first subplot
    # (presumably the show-legend flag -- confirm against plot_histogram).
    for billing_frame, billing_type in zip(billing_frames, types_attr):
        histogram_trace = plot_histogram(billing_frame, sub_attr, billing_type, col_idx == 1, True)
        fig.add_trace(histogram_trace, row=1, col=col_idx)

# NOTE(review): only the first two subplots get a fixed y-range here; the
# TOTALCHARGES subplot is left auto-scaled, unlike the CONTRACT and
# PAYMENTMETHOD cells -- confirm whether that is intentional.
for col_idx in (1, 2):
    fig.update_yaxes(range=[0, 1250], row=1, col=col_idx)

billing_colors = zip(types_attr, ["#636EFA", "#EF553B"])
for billing_type, marker_color in billing_colors:
    fig.for_each_trace(
        lambda trace, wanted=billing_type, color=marker_color:
            trace.update(marker=dict(color=color)) if trace.name == wanted else ()
    )

fig.update_layout(
    height=400,
    width=1000,
    barmode="stack",
    title="PAPERLESSBILLING Distribution",
    legend_title_text="Paperless billing type",
)
fig.show()

* Tenure is initially dominated by electronic and mailed checks. However, as tenure increases their presence decreases, and at the largest tenure values credit card and bank transfers are the largest contributors.
* The peak in monthly charges is mainly created by mailed checks and, to a significantly lesser degree, by bank transfers and credit cards.
* In contrast, electronic checks have a very small presence in the initial monthly charges peak, but as the charges grow so does the contribution of electronic checks, to the point that it becomes the dominant payment method in the normal distribution. The other three combined contribute as much as electronic checks do.
* Electronic and mailed checks dominate the exponential distribution of total charges at values < 1400. Beyond that point, their presence decreases while bank transfers and credit cards see their numbers increase. This tendency continues until around > 8000 total charges, at which point electronic and mailed checks are not present at all (mailed checks disappear at values > 6000).

In [None]:
# Histograms of TENURE / MONTHLYCHARGES / TOTALCHARGES, stacked by payment method.
attr = "PAYMENTMETHOD"
# NOTE(review): the credit-card label is spelled with a space before "(automatic)",
# matching the bank-transfer label; without it the comparison below selects no rows.
# Confirm against df_churn["PAYMENTMETHOD"].unique().
types_attr = ["Electronic check", "Mailed check", "Bank transfer (automatic)", "Credit card (automatic)"]
df_contract_pm_ec = df_f_c_attributes.loc[df_f_c_attributes[attr] == types_attr[0]].copy()
df_contract_pm_mc = df_f_c_attributes.loc[df_f_c_attributes[attr] == types_attr[1]].copy()
df_contract_pm_bt = df_f_c_attributes.loc[df_f_c_attributes[attr] == types_attr[2]].copy()
df_contract_pm_cc = df_f_c_attributes.loc[df_f_c_attributes[attr] == types_attr[3]].copy()

# Frames listed in the same order as types_attr so each trace is built from
# the rows of the payment method it is labelled with (the credit-card traces
# must come from df_contract_pm_cc, not from the bank-transfer frame).
payment_frames = [df_contract_pm_ec, df_contract_pm_mc, df_contract_pm_bt, df_contract_pm_cc]

sub_attr_list = ["TENURE", "MONTHLYCHARGES", "TOTALCHARGES"]

fig = make_subplots(rows=1, cols=3, subplot_titles=sub_attr_list)

for col_idx, sub_attr in enumerate(sub_attr_list, start=1):
    for payment_frame, payment_type in zip(payment_frames, types_attr):
        # 4th positional argument is True only in the first subplot
        # (presumably the show-legend flag -- confirm against plot_histogram).
        histogram_trace = plot_histogram(payment_frame, sub_attr, payment_type, col_idx == 1, True)
        fig.add_trace(histogram_trace, row=1, col=col_idx)
    fig.update_yaxes(range=[0, 1250], row=1, col=col_idx)

# Same colour for a given payment method in every subplot.
payment_colors = zip(types_attr, ["#636EFA", "#EF553B", "#00CC96", "#AB63FA"])
for payment_type, marker_color in payment_colors:
    fig.for_each_trace(
        lambda trace, wanted=payment_type, color=marker_color:
            trace.update(marker=dict(color=color)) if trace.name == wanted else ()
    )

fig.update_layout(height=400, width=1000,barmode="stack", title="PAYMENTMETHOD Distribution", 
                  legend_title_text="Payment method")
fig.show()

### Churn Attribute

* 73.42% of customers in this data set do not churn.

In [None]:
# Overall churn distribution. `attr` is left set to "CHURN" on purpose:
# the "Churn by ..." cells further down read it without redefining it.
attr = "CHURN"
plot_bar_chart(df_churn, attr)

#### Churn by Customer Attributes

* Gender has no effect on the churn.
* Customers with or without a partner are equal with a negative churn. However, when churn is positive those without partner dominate (64.21%).
* Non-senior clients comprise the large majority of clients that do and do not churn (87.1% and 74.53% respectively). 
* Clients with no dependents dominate in both churn outcomes, with 82.56% for churn and 65.66% for no churn.

In [None]:
# Stacked bar charts of the four customer attributes, broken down by `attr`
# (set to "CHURN" in the churn bar-chart cell above).
fig = make_subplots(rows=2, cols=2, subplot_titles=customer_attrs)

# 4th positional argument passed to plot_stack_bar_chart for each subplot
# (presumably the show-legend flag -- confirm against the helper).
legend_flags = [True, True, False, False]

# One helper call per subplot; its first two traces are added to the figure
# (the original re-ran the helper once per trace). zip() stops after the four
# plotted attributes even if customer_attrs holds more entries.
for position, (sub_attr, legend_flag) in enumerate(zip(customer_attrs, legend_flags)):
    row_idx, col_idx = divmod(position, 2)
    row_idx, col_idx = row_idx + 1, col_idx + 1
    subplot_traces = plot_stack_bar_chart(df_customer_attributes, attr, sub_attr, legend_flag, True)
    for trace in subplot_traces[:2]:
        fig.add_trace(trace, row=row_idx, col=col_idx)
    fig.update_xaxes(title_text=attr, row=row_idx, col=col_idx)

# Colour rules are applied in order; a trace keeps the colour of the last
# fragment found in its name.
marker_colors = [
    ("Yes", "#636EFA"),
    ("No", "#EF553B"),
    ("Male", "#AB63FA"),
    ("Female", "#00CC96"),
]
for name_fragment, marker_color in marker_colors:
    fig.for_each_trace(
        lambda trace, fragment=name_fragment, color=marker_color:
            trace.update(marker=dict(color=color)) if fragment in trace.name else ()
    )

fig.update_layout(height=600, width=1000, barmode="stack", title_text="Churn by Customer Attributes")
fig.show()

### Churn by Hired Services Attributes

* Both churn outputs show that those with phone service significantly dominate (90.9%).
* Clients without multiple lines are more numerous when there is no churn (2536 vs 2117). When there is churn, clients with or without multiple lines are virtually the same (850 vs 849).
* DSL internet service is slightly larger than optic fiber service with a negative churn. However, when there is churn, the large majority of clients have optic fiber (69.4%).
* For online security, online backup, tech support and device protection there is a common pattern in which customers with a negative churn are dominated by those that have not hired these services (~37%). Those that have hired the service follow (~36%) and finally those without internet are the rest (~27%).
* For these same services, customers with a positive churn are mainly those that have not hired these services (~70%).
* Regarding streaming services, both clients with and without churn are almost equally distributed between those with or without these services (excluding clients without any internet). However, when the churn is negative, clients with these services slightly dominate, and clients without these services slightly dominate when the churn is positive (~65%).

In [None]:
# Stacked bar charts of the nine hired-service attributes, broken down by `attr`
# (set to "CHURN" in the churn bar-chart cell above).
fig = make_subplots(rows=3, cols=3, subplot_titles=services_attrs)

# Per subplot: the 4th positional argument passed to plot_stack_bar_chart for
# each trace that is added (presumably the show-legend flag -- confirm against
# the helper). The flags differ per trace, apparently so that every category
# label is listed only once in the shared legend.
legend_flags = [
    (False, False),
    (True, True, True),
    (True, True, False),
    (False, True, False),
] + [(False, False, False)] * 5

# zip() stops after the nine plotted attributes even if services_attrs holds more entries.
for position, (sub_attr, trace_flags) in enumerate(zip(services_attrs, legend_flags)):
    row_idx, col_idx = divmod(position, 3)
    row_idx, col_idx = row_idx + 1, col_idx + 1
    # Call the helper once per distinct flag value instead of once per trace.
    traces_by_flag = {
        legend_flag: plot_stack_bar_chart(df_services_attributes, attr, sub_attr, legend_flag, True)
        for legend_flag in set(trace_flags)
    }
    for trace_idx, legend_flag in enumerate(trace_flags):
        fig.add_trace(traces_by_flag[legend_flag][trace_idx], row=row_idx, col=col_idx)
    fig.update_xaxes(title_text=attr, row=row_idx, col=col_idx)

# Colour rules are applied in order, so the more specific "phone service" /
# "internet service" fragments override the broader "No".
marker_colors = [
    ("Yes", "#636EFA"),
    ("No", "#EF553B"),
    ("phone service", "#AB63FA"),
    ("internet service", "#00CC96"),
]
for name_fragment, marker_color in marker_colors:
    fig.for_each_trace(
        lambda trace, fragment=name_fragment, color=marker_color:
            trace.update(marker=dict(color=color)) if fragment in trace.name else ()
    )

fig.update_layout(height=800, width=1000, barmode="stack", title_text="Churn by Hired Service Attributes")
fig.show()

### Churn by Financial & Contractual Attributes

* By contract, negative churn is dominated by month-to-month contracts (2220), followed by two years (1637) and finally one year contracts (1306).
* In the case of positive churn, the month-to-month contract consist of the large majority of contracts (88.55%), meanwhile 2.57% of the contracts are two years and one year contracts are 8.88%.
* Paperless billing is larger than no paperless billing in both churn options. However, when there is a negative churn it is slightly larger (53.61%) and significantly larger (74.9%) with positive churns.
* With respect to payment methods, customers without churn are roughly equally distributed between all four payment methods.
* However, customers with churn prefer to pay via electronic check (57.3%) in contrast with the other three.

In [None]:
# Stacked bar charts of CONTRACT / PAPERLESSBILLING / PAYMENTMETHOD, broken
# down by `attr` (set to "CHURN" in the churn bar-chart cell above).
fig = make_subplots(rows=1, cols=3, subplot_titles=f_c_attrs[:3])

# Number of traces taken from the helper's result for each attribute.
trace_counts = [3, 2, 4]

# One plot_stack_bar_chart call per subplot (the original re-ran the helper
# once for every trace it picked out of the result).
for col_idx, (sub_attr, n_traces) in enumerate(zip(f_c_attrs[:3], trace_counts), start=1):
    subplot_traces = plot_stack_bar_chart(df_f_c_attributes, attr, sub_attr, True, True)
    for trace in subplot_traces[:n_traces]:
        fig.add_trace(trace, row=1, col=col_idx)
    fig.update_xaxes(title_text=attr, row=1, col=col_idx)

fig.update_layout(height=400, width=1000, barmode="stack", title_text="Churn by Contractual Attributes")
fig.show()

* Although positive churn dominates at tenures equal to 1, beyond this value negative churn dominates. 
* In fact, positive churn diminishes as tenure increases to the point that it is basically insignificant (tenure 72). 
* Negative churn dominates at all monthly charge values.
* Both churns follow an exponential distribution with respect to totalcharges. 
* Negative churn dominates over positive churn at all values.
* At total charges > 8000, positive churn is not present.

In [None]:
# Histograms of TENURE / MONTHLYCHARGES / TOTALCHARGES, stacked by churn outcome.
# `attr` is still "CHURN" from the churn bar-chart cell above.
churn_mask_yes = df_f_c_attributes[attr] == "Yes"
churn_mask_no = df_f_c_attributes[attr] == "No"
df_contract_ch_y = df_f_c_attributes.loc[churn_mask_yes].copy()
df_contract_ch_n = df_f_c_attributes.loc[churn_mask_no].copy()

fig = make_subplots(rows=1, cols=3, subplot_titles=f_c_attrs[3:])

churn_frames = (("Yes", df_contract_ch_y), ("No", df_contract_ch_n))
for col_idx, numeric_attr in enumerate(f_c_attrs[3:6], start=1):
    for churn_label, churn_frame in churn_frames:
        # 4th positional argument is True only in the first subplot
        # (presumably the show-legend flag -- confirm against plot_histogram).
        histogram_trace = plot_histogram(churn_frame, numeric_attr, churn_label, col_idx == 1, True)
        fig.add_trace(histogram_trace, row=1, col=col_idx)

for churn_label, marker_color in (("Yes", "#636EFA"), ("No", "#EF553B")):
    fig.for_each_trace(
        lambda trace, wanted=churn_label, color=marker_color:
            trace.update(marker=dict(color=color)) if trace.name == wanted else ()
    )

fig.update_layout(
    height=400,
    width=1000,
    barmode="stack",
    title="Churn by Financial Attributes",
    legend_title_text="Churn",
)
fig.show()

## Model Selection and Training

* Before training a model/s, the data will be split (into training and testing data) and then encoded.
* Two models are trained and tested. These are Random Forest Classifier and Support Vector Machine Classifier.

### Data Splitting

* Data is split into training and testing data. 
* The churn column of the dataset is taken as the labels (y).
* All other columns (except for the already deleted customerID) are considered the input data (X).

In [None]:
# Inputs are every column except the last one; labels are the CHURN column.
# NOTE(review): this assumes CHURN is the last column of df_churn -- confirm.
feature_frame = df_churn.iloc[:, :-1]
X = feature_frame.values
y = df_churn["CHURN"].values

for array_name, array in (("X", X), ("y", y)):
    print(f"{array_name} shape:\n", array.shape)

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123)

split_arrays = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for array_name, array in split_arrays:
    print(f"{array_name} shape:\n", array.shape)

### Data Encoding

* Categorical binary attributes are label encoded (Yes = 1, No = 0).
> * These are: Churn, gender, Partner, Dependents, PhoneService and PaperlessBilling.
> * SeniorCitizen is not label encoded since it already is (it has values of 1 and 0).
* Categorical non binary attributes are one hot encoded.
> * These are: MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, and PaymentMethod.
* Non-categorical data is excluded from encoding.
> * These are: SeniorCitizen, Tenure, MonthlyCharges and TotalCharges.
* Test data is encoded with the trained encoders from fitting the training data.

In [None]:
# Encoders are created.
# drop="if_binary" keeps a single 0/1 column for two-valued attributes;
# handle_unknown="error" makes transform() fail on categories unseen during fit.
ohe = OneHotEncoder(drop="if_binary", handle_unknown="error")
le = LabelEncoder()

num_col = [1, 4, 17, 18]  # These numbers correspond to the columns of non categorical data.

# Categorical and non-categorical data is separated.
# NOTE: X_train / X_test are overwritten further down, so this cell must be
# re-run together with the data-splitting cell, not on its own.
X_train_cat = np.delete(X_train, num_col, 1)
X_train_num = X_train[:,num_col]

X_test_cat = np.delete(X_test, num_col, 1)
X_test_num = X_test[:,num_col]

# Categorical data is encoded (encoder fitted on the training data only).
X_train_cat = ohe.fit_transform(X_train_cat)
X_test_cat = ohe.transform(X_test_cat)

# Names of the encoded columns, used later to label the feature importances.
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back on old versions.
if hasattr(ohe, "get_feature_names_out"):
    feature_importance_index = ohe.get_feature_names_out()
else:
    feature_importance_index = ohe.get_feature_names()

# The encoder returns sparse matrices; convert them to dense arrays for hstack.
X_train_cat = X_train_cat.toarray()
X_test_cat = X_test_cat.toarray()

# Categorical encoded data and non-categorical data are unified
# (encoded columns first, then the numeric columns in num_col order).
X_train = np.hstack((X_train_cat, X_train_num))
X_test = np.hstack((X_test_cat, X_test_num))

# Labels (churn) is label encoded
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

print("X_train shape:\n", X_train.shape)
print("X_test shape:\n", X_test.shape)
print("y_train shape:\n", y_train.shape)
print("y_test shape:\n", y_test.shape)

### Random Forest Classifier

* Random Forest Classifier is a powerful algorithm with a low bias and an average variance.
* Another strong reason to use this classifier is because the data does not require scaling or reducing its dimensions.
* In particular, this data set is relatively small and thus although this algorithm is relatively computationally costly, it is still feasible within the purpose of this notebook.

#### Model Training

* To train the model, a k-fold grid search cross validation is used to estimate the best parameters for the model.
* This segment of code is commented due to the time it takes to execute (~5 mins). However the best results are shown below.
* The best parameters are the following:

_Note: Change cell below to code in order to run the grid search._

#Create model
rfc = RandomForestClassifier(random_state=123)

#Select the k-fold value for the cross validation
k = 5  
kfold = KFold(n_splits=k)

#Define the parameter grid for the grid search cv
n_estimators = [50, 100, 200, 300]
max_depth = [10, 20, 30, 50]
#min_samples_split must be an integer >= 2 (or a float fraction); a value of 1
#is rejected by RandomForestClassifier, so every fit using it would fail
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 5, 10]

param_grid = dict(n_estimators = n_estimators, max_depth = max_depth, min_samples_split = min_samples_split, 
                  min_samples_leaf = min_samples_leaf)

#Execute the grid search cv (n_jobs = -1 uses all available cores)
rfc_grid = GridSearchCV(rfc, param_grid, cv = kfold, verbose = 3, n_jobs = -1)

#Get best parameters
rfc_grid.fit(X_train, np.ravel(y_train))
best_params = rfc_grid.best_params_
print("best parameters:", best_params)

In [None]:
# Hyper-parameters hard-coded from an earlier grid-search run, so the slow
# search does not have to be repeated on every execution of the notebook.
best_params = {"max_depth": 10, "min_samples_leaf": 2, "min_samples_split": 10, "n_estimators": 300}
# Fixed random_state keeps the fitted forest reproducible.
rfc = RandomForestClassifier(**best_params, random_state=123)
# np.ravel passes the labels as a flat 1-D array.
rfc.fit(X_train, np.ravel(y_train))

#### Model Testing Error

* A classification report and a confusion matrix are created for evaluating the testing error of the model.

In [None]:
# Column labels in the same order as the columns of X_train: the one-hot
# encoded categorical columns first, followed by the four numeric columns.
numeric_feature_names = ["SENIORCITIZEN", "TENURE", "MONTHCHARGE", "TOTALCHARGE"]
feature_names = feature_importance_index.tolist() + numeric_feature_names
print(feature_names)

importances = rfc.feature_importances_
weights = pd.Series(importances, index=feature_names)

# Horizontal bar chart of the ten most important features.
top_weights = weights.sort_values()[-10:]
top_weights.plot(kind = 'barh')

In [None]:
# Evaluate the random forest on the held-out test set.
y_pred = rfc.predict(X_test)

report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print("Classification Report:")
print(report_text)
print("Accuracy:", test_accuracy, "\n")

# Confusion matrix as a labelled DataFrame (rows: actual, columns: predicted).
class_labels = np.unique(y_test)
data = confusion_matrix(y_test, y_pred)
df_confusion_matrix = pd.DataFrame(data, columns=class_labels, index=class_labels)
df_confusion_matrix.index.name = 'Actual'
df_confusion_matrix.columns.name = 'Predicted'

plt.figure(figsize=(8, 5))
seaborn.set(font_scale=0.9)
# Trailing ';' suppresses the Axes repr in the cell output.
seaborn.heatmap(
    df_confusion_matrix,
    cmap="Reds",
    annot=True,
    annot_kws={"size": 16},
    fmt='d',
);

### Support Vector Machine Classifier

* The main reason to use this powerful algorithm is because this is a two class classification problem.

#### MinMaxScaling

* All four non-categorical columns are scaled. 
* To preserve the shape of the dataset (no distortion), they are min max scaled to values between (0, 1) instead of standard scaled.

In [None]:
# Scale the numeric columns for the SVM. The scaler is fitted on the training
# data only and then applied to the test data.
scaler = MinMaxScaler()

# Values before scaling.
for array_name, array in (("X_train_num", X_train_num), ("X_test_num", X_test_num)):
    print(f"{array_name}:\n", array)

X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)

# Values after scaling.
for array_name, array in (("X_train_num", X_train_num), ("X_test_num", X_test_num)):
    print(f"{array_name}:\n", array)

# Rebuild the design matrices: encoded categorical columns + scaled numeric columns.
X_train = np.concatenate((X_train_cat, X_train_num), axis=1)
X_test = np.concatenate((X_test_cat, X_test_num), axis=1)

#### Model Training

* To train the model, a k-fold grid search cross validation is used to estimate the best parameters for the model.
* This segment of code is commented due to the time it takes to execute (~4 mins). However the best results are shown below.
* The best parameters are the following:

_Note: Change cell below to code in order to run the grid search._

#Create model (fixed random_state)
clf = SVC(random_state=123)

#Create the k-folds for cv
k = 5
kfold = KFold(n_splits=k)

#Generate the parameters for the grid search cv
param_grid = {"C": [0.1, 1, 10], "gamma": [0.01, 0.1, 1, 10, 100], "kernel": ["linear", "rbf"]}

#Apply the grid search cv
grid = GridSearchCV(clf, param_grid, cv=kfold, verbose=1)

#Fit on the training split; labels are flattened to 1-D exactly as in the final SVC fit
grid.fit(X_train, np.ravel(y_train))

#Get best params: best_params_ is the winning parameter dict, whereas
#best_estimator_ is the refitted model and cannot be unpacked into SVC(**best_params)
best_params = grid.best_params_
print("best parameters:", best_params)

In [None]:
# Hyper-parameters are hard-coded here so the slow grid search does not have to be re-run.
best_params = dict(C=1, gamma=0.01, kernel="linear")
clf = SVC(random_state=123, **best_params)
# Labels are flattened to 1-D before fitting.
clf.fit(X_train, np.ravel(y_train))

#### Model Testing Error

* A classification report and a confusion matrix are created for evaluating the testing error of the model.

In [None]:
# Predict on the held-out split with the fitted SVM.
y_pred = clf.predict(X_test)

# Text metrics: per-class report followed by the overall accuracy.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred), "\n")

# Confusion matrix as a labelled frame: rows are actual classes, columns are predicted ones.
data = confusion_matrix(y_test, y_pred)
class_labels = np.unique(y_test)
df_confusion_matrix = pd.DataFrame(
    data, index=class_labels, columns=class_labels
).rename_axis(index='Actual', columns='Predicted')

# Draw the matrix as an annotated heat map with integer cell counts.
plt.figure(figsize=(8, 5))
seaborn.set(font_scale=0.9)
seaborn.heatmap(
    df_confusion_matrix,
    cmap="Reds",
    annot=True,
    annot_kws={"size": 16},
    fmt='d',
);

## Conclusions

### Best performing model

* Support Vector Machine classifies better than the Random Forest Classifier (80.53% and 80.1% accuracy respectively).
* The precision of the SVM is 83% for those clients that do not churn, and 71% for those that do churn.
* Grid search CV has shown that a linear kernel fits the data better. This is also understood from the nature of the labels (1,0).

### Churn Behaviour

#### Reasons Behind Customer Churn

* Senior citizens and customers without dependents are the largest groups that do churn.
* Customers with internet services comprise the large majority of clients that churn.
* Of those with internet, they are more likely to churn if the internet service is optic fiber.
* Clients without device protection, tech support, online backup and online security also comprise the majority of clients that churn.
* Regarding the payment behaviour of the clients that do churn, those that have month-to-month contracts, use paperless billing and pay via electronic checks are the most likely to churn compared to other behaviours.
* __Tenure is an important factor that determines Churn__. Clients with a tenure of only one month tend to churn, and although churn decreases as tenure increases, it is not until tenure > 20 that clients that churn become a non-significant minority (and they disappear at tenure > 72).
* Monthly charges between 70-105 play a significant role in increasing the churn of the clients.
* Lastly, clients that churn follow an exponential distribution with regard to the total charges. Thus, as the total charges increase, the number of clients that churn becomes insignificant compared to those that do not.
* Below is the feature importance output from the Random Forest Classifier:

In [None]:
# Columns of df_churn that remain after dropping the numeric attributes and the CHURN target.
non_categorical_attrs = ["SENIORCITIZEN", "TENURE", "MONTHLYCHARGES", "TOTALCHARGES"]
df_feature_importance = df_churn.drop(columns=non_categorical_attrs + ["CHURN"])
churn_attributes = list(df_feature_importance.columns)

# Importance labels: names from feature_importance_index plus the four numeric labels.
encoded_attributes = feature_importance_index.tolist() + ["SENIORCITIZEN", "TENURE", "MONTHCHARGE", "TOTALCHARGE"]

# Swap every positional placeholder "x<i>_" for the name of the i-th remaining column.
for i, attr_name in enumerate(churn_attributes):
    placeholder = f"x{i}_"
    encoded_attributes = [
        attr.replace(placeholder, attr_name + "_")
        for attr in encoded_attributes
    ]

importances = rfc.feature_importances_
weights = pd.Series(importances, index=encoded_attributes)
# Ascending sort, so the tail holds the ten largest importances.
weights.sort_values().tail(10).plot.barh();

#### How To Avoid Customer Churn

* __The major factor is tenure__. During the first tenure month the largest number of clients churn.
* To avoid this, several strategies can be implemented to also affect the features that influence churn.
* For example, __clients without additional internet services (device protection, tech support, online backup and online security) tend to churn more__. Thus, offering these services for (e.g.) __6 months for free__ when the client is new, may substantially prolong the tenure of the client at the company and thus avoid its churn.
* Another strategy to prolong the tenure of a client, and thus reduce the possibility of him/her churning, is to __change the contract type to a non month-to-month contract__. Another option is to __decrease the monthly charges for the first 6 months (with a discount)__.

### What can be improved

* The model suffers from a recall of 55%. This can be improved by adding more data of clients that do churn.
* Testing different models (e.g. XGBoost or a neural network) and widening the hyperparameter search may improve the accuracy. However, due to the time restrictions and the nature of this notebook, a more in-depth analysis of several models and tuning of their parameters (for example with RandomizedSearchCV) would be considerably time consuming.