In [None]:
import pandas as pd
import numpy as np

# Visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

from xgboost import XGBRegressor
from xgboost import plot_importance

In [None]:
# Read the Telco customer-churn CSV into a DataFrame and preview the first rows.
# The path is relative to the notebook's working directory.
csv_filepath = "../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
data = pd.read_csv(csv_filepath)
data.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Number of missing (NaN/None) entries per column.
data.isnull().sum()

In [None]:
def plot_pie(df):
  """Show a donut chart with the number of customers in each "Churn" class.

  Args:
    df: DataFrame that contains a "Churn" column.
  """
  # value_counts() orders the classes by frequency, so the labels are taken
  # from its index rather than assumed to be ["No", "Yes"]; a hard-coded list
  # mislabels the slices whenever "Yes" happens to be the larger class.
  churn_counts = df["Churn"].value_counts()
  labels = churn_counts.index.tolist()
  values = churn_counts.tolist()

  colors = ['gold', 'royalblue']

  # hole=.4 turns the pie into a donut; each slice displays its raw count.
  fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.4)])
  fig.update_traces(hoverinfo="label+value+text", textfont_size=20, textinfo="value",
                    marker=dict(colors=colors, line=dict(color="white", width=2)))
  fig.update_layout(dict(title="Customer Churn"))
  fig.show()

In [None]:
# Overall split of the "Churn" column across all customers.
plot_pie(data)

**Customer distribution analysis**

In [None]:
def distribution_pie_plot(df, column):
  """Draw two side-by-side donut charts of `column`: churned vs. retained customers.

  Args:
    df: DataFrame with a "Churn" column holding "Yes"/"No" and the column to plot.
    column: Name of the categorical column whose value counts are shown.
  """
  # Pie traces need 'domain'-type subplot cells.
  fig = make_subplots(rows=1, cols=2, specs=[[{"type": "domain"}, {"type": "domain"}]])

  # (trace name, value of "Churn" selecting the group, subplot column)
  panels = [("Churn", "Yes", 1), ("No Churn", "No", 2)]
  for trace_name, churn_flag, subplot_col in panels:
    # Count once per group and reuse the result for both values and labels.
    counts = df.loc[df["Churn"] == churn_flag, column].value_counts()
    fig.add_trace(go.Pie(values=counts.values.tolist(),
                         labels=counts.index.tolist(),
                         name=trace_name),
                  1, subplot_col)

  fig.update_traces(hole=.4, hoverinfo="label+percent+name")

  # Plotly text uses "<br>" for line breaks; a "\n" is rendered as a plain space.
  fig.update_layout(title_text=column + "<br>" + "Distribution Customer Analysis",
                    # Captions placed in the center of each donut
                    annotations=[
                        dict(text="Churn", x=0.18, y=0.5, font_size=20, showarrow=False),
                        dict(text="No Churn", x=0.82, y=0.5, font_size=20, showarrow=False)
                    ])
  fig.show()

In [None]:
# Categorical features to compare between churned and retained customers.
category_columns = [
    "Contract",
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "PaperlessBilling",
    "PaymentMethod",
]

# One pair of donut charts (churn / no churn) per categorical column.
for col in category_columns:
    distribution_pie_plot(data, col)

**Histogram for distribution of numerical columns**

In [None]:
def distribution_histogram(df, column):
  """Plot percent-normalised histograms of `column` for churned and retained customers.

  Both groups are drawn on the same figure; with histnorm="percent" each
  trace is normalised on its own, so the groups are comparable despite
  having different sizes.

  Args:
    df: DataFrame with a "Churn" column holding "Yes"/"No" and the column to plot.
    column: Name of the column whose distribution is shown.
  """
  fig = go.Figure()

  # (trace name, value of "Churn" selecting the group)
  for trace_name, churn_flag in (("Churn", "Yes"), ("No Churn", "No")):
    fig.add_trace(go.Histogram(x=df.loc[df["Churn"] == churn_flag, column],
                               histnorm="percent",
                               name=trace_name,
                               marker=dict(line=dict(width=0.5, color="black")),
                               opacity=0.75))

  # Grid/tick styling shared by both axes.
  axis_style = dict(gridcolor="white", zerolinewidth=1, ticklen=5, gridwidth=2)

  # Plotly text uses "<br>" for line breaks; a "\n" is rendered as a plain space.
  fig.update_layout(title_text=column + "<br>" + "Histogram Customer Analysis",
                    bargap=0.2,
                    bargroupgap=0.1,
                    xaxis=dict(title=column, **axis_style),
                    yaxis=dict(title="percent", **axis_style))
  fig.show()

In [None]:
num_columns = ["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]

# Plot from a numeric copy so every histogram gets a continuous x-axis.
# NOTE(review): "TotalCharges" is presumably read from the CSV as text (check the
# data.info() output); raw strings would be binned as categories. pd.to_numeric is a
# no-op for columns that are already numeric, and errors="coerce" turns unparsable
# entries into NaN, which the histogram simply skips. `data` itself is not modified.
numeric_view = data.assign(
    **{name: pd.to_numeric(data[name], errors="coerce") for name in num_columns}
)

for col in num_columns:
  distribution_histogram(numeric_view, col)

**Correlation**

In [None]:
# Columns holding category labels (including the "Churn" target) to be integer-encoded.
category_columns = [
    "Contract", "gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
    "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies", "PaperlessBilling", "PaymentMethod",
    "Churn",
]

# "TotalCharges" is an amount, not a category: label-encoding it would replace every
# distinct value by an arbitrary class id (ordered by string comparison if the column
# is text) and destroy its magnitude. Parse it as a number instead.
# NOTE(review): entries that cannot be parsed become NaN — confirm how many there are
# and whether the scaler/model versions in use accept missing values, or impute them.
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")

# Encode Categorical Columns
# apply() calls fit_transform once per column, so the encoder is re-fitted each time
# and only remembers the classes of the last column afterwards.
labelencoder = LabelEncoder()
data[category_columns] = data[category_columns].apply(labelencoder.fit_transform)
data.head(5)

In [None]:
def get_correlation(df):
  """Plot a heatmap of the pairwise Pearson correlations between the numeric columns of `df`.

  Args:
    df: DataFrame to analyse. Non-numeric columns (e.g. identifier strings)
      are left out of the correlation matrix.
  """
  # Select the numeric/bool columns explicitly: recent pandas versions raise in
  # DataFrame.corr() when text columns are present instead of silently dropping them.
  numeric_df = df.select_dtypes(include=["number", "bool"])
  correlation = numeric_df.corr()

  # Axis labels and the matrix itself as a plain array for plotly
  cols_matrix = correlation.columns.tolist()
  correlation_arr = correlation.to_numpy()

  fig = go.Figure()
  fig.add_trace(go.Heatmap(x=cols_matrix,
                           y=cols_matrix,
                           z=correlation_arr,
                           colorscale="Viridis",
                           # Nested title dict replaces the deprecated flat
                           # `titleside` attribute, which newer plotly rejects.
                           colorbar=dict(title=dict(text="Pearson Correlation coefficient",
                                                    side="right"))))
  fig.update_layout(dict(title="Correlation Matrix",
                         height=770,
                         width=900,
                         autosize=False,
                         yaxis=dict(tickfont=dict(size=9)),
                         xaxis=dict(tickfont=dict(size=9))))
  fig.show()

In [None]:
# Correlation heatmap of the frame after the categorical columns were integer-encoded.
get_correlation(data)

# XGBoost Model

**Prepare train set and test set**

In [None]:
# Target and feature matrix; customerID is dropped from the features together
# with the target itself.
Y = data["Churn"]
X = data.drop(["Churn", "customerID"], axis=1)
print("Shape of X = {}".format(X.shape))
print("Shape of Y = {}".format(Y.shape))

# Separate train and test data BEFORE scaling, so that the scaler never sees the
# held-out rows (fitting it on the full dataset leaks test information).
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

# Normalize data: min/max are learned from the training rows only and then applied
# unchanged to the test rows (test values may therefore fall slightly outside [0, 1]).
scaler = MinMaxScaler()
scaler.fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Keep X bound to the scaled full matrix (a NumPy array), as before.
X = scaler.transform(X)

print("Shape of X_train = {}".format(X_train.shape))
print("Shape of X_test = {}".format(X_test.shape))
print("Shape of Y_train = {}".format(Y_train.shape))
print("Shape of Y_test = {}".format(Y_test.shape))

**Build the model**

In [None]:
import xgboost as xgb

In [None]:
# Wrap the train/test splits in DMatrix, the input container used by xgb.train
d_train_Matrix = xgb.DMatrix(X_train, label=Y_train)
d_test_Matrix = xgb.DMatrix(X_test, label=Y_test)

# set xgboost params
param = {
    "max_depth": 3,
    "eta": 0.3,
    # `verbosity` replaces the old `silent` flag, which current xgboost releases
    # no longer recognise; 0 keeps training output quiet.
    "verbosity": 0,
    # softprob makes predict() return one probability per class for every row
    "objective": "multi:softprob",
    "num_class": 2,
}

# the number of training iterations (boosting rounds)
num_iteration = 20

# training
model = xgb.train(param, d_train_Matrix, num_iteration)
# Prediction: one row of class probabilities per test sample
pred = model.predict(d_test_Matrix)

# most confident class per row
best_pred = pred.argmax(axis=1)

# Eval model (precision/recall are macro-averaged over the two classes)
print("Accuracy score = {}".format(accuracy_score(Y_test, best_pred)))
print("Precision score = {}".format(precision_score(Y_test, best_pred, average="macro")))
print("Recall score = {}".format(recall_score(Y_test, best_pred, average="macro")))

**Feature importance**

In [None]:
# Bar chart of the trained booster's feature importances (xgboost draws it with matplotlib).
# NOTE(review): the DMatrix was built from a NumPy array, so the features are presumably
# labelled f0, f1, ... rather than by column name — confirm and pass feature names if needed.
plot_importance(model)
plt.show()